#!/usr/bin/env bash
# Tokenize text in parallel: run utils/text_tokenize.py on each of $nj
# pre-split text files, then concatenate the per-job outputs.

# Begin configuration section.
nj=32
cmd=utils/run.pl
# End configuration section.

echo "$0 $@"  # Print the command line for logging.

. utils/parse_options.sh || exit 1;

# tokenize configuration
text_dir=$1
seg_file=$2
logdir=$3
output_dir=$4

txt_dir=${output_dir}/txt; mkdir -p ${txt_dir}
mkdir -p ${logdir}

# Run one tokenization job per split of the input text.
$cmd JOB=1:$nj $logdir/text_tokenize.JOB.log \
  python utils/text_tokenize.py -t ${text_dir}/txt/text.JOB.txt \
    -s ${seg_file} -i JOB -o ${txt_dir} \
  || exit 1;

# Concatenate the per-job text files together.
for n in $(seq $nj); do
  cat ${txt_dir}/text.$n.txt || exit 1
done > ${output_dir}/text || exit 1

# Concatenate the per-job length (shape) files.
for n in $(seq $nj); do
  cat ${txt_dir}/len.$n || exit 1
done > ${output_dir}/text_shape || exit 1

echo "$0: Text tokenization succeeded."
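
# Example invocation (illustrative only; the script name and the data/exp
# paths below are hypothetical placeholders, while the four positional
# arguments and the --nj/--cmd options come from the script itself):
#
#   local/tokenize_text.sh --nj 32 --cmd utils/run.pl \
#     data/corpus data/segments exp/tokenize/log data/tokenized
#
# This assumes data/corpus/txt/text.1.txt .. text.32.txt already exist
# (one split per job, matching --nj); the concatenated outputs are written
# to data/tokenized/text and data/tokenized/text_shape.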