FunASR/funasr/datasets/audio_datasets/update_jsonl.py

100 lines
3.5 KiB
Python

import os
import json
import torch
import logging
import hydra
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
import torch.distributed as dist
import threading
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
def gen_scp_from_jsonl(jsonl_file, jsonl_file_out, ncpu):
    """Refresh every record of one jsonl file and write the result to a new file.

    Each line of ``jsonl_file`` is handed to ``update_data``, which rewrites the
    entry in place inside the shared ``lines`` list. The resulting entries are
    then written to ``jsonl_file_out``, exactly one record per line.

    Args:
        jsonl_file: Path of the input jsonl file (read as UTF-8).
        jsonl_file_out: Path of the output jsonl file (written as UTF-8).
        ncpu: Number of worker threads. Values greater than 1 process the
            records through a thread pool; otherwise the records are processed
            sequentially in the calling thread.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        Exception: In the sequential path, whatever ``update_data`` raises.
            In the threaded path worker failures are logged and the affected
            record is written out as it currently stands in ``lines``.
    """
    # Read the whole input BEFORE touching the output: a missing input must not
    # leave behind an empty output file, and in-place use (same path for input
    # and output) must not truncate the data before it has been read.
    with open(jsonl_file, encoding="utf-8") as fin:
        lines = fin.readlines()

    num_total = len(lines)
    if ncpu > 1:
        # Bound the number of concurrent workers with a thread pool.
        with ThreadPoolExecutor(max_workers=ncpu) as executor:
            # Submit one task per record; every task only writes lines[i].
            futures = {
                executor.submit(update_data, lines, i): i for i in tqdm(range(num_total))
            }
            # Block until all tasks are done and surface failures instead of
            # dropping them silently.
            for future in concurrent.futures.as_completed(futures):
                err = future.exception()
                if err is not None:
                    logging.error(
                        "Failed to update line %d of %s: %r", futures[future], jsonl_file, err
                    )
    else:
        for i in range(num_total):
            update_data(lines, i)
    logging.info("All audio durations have been processed.")

    # Entries rewritten by update_data carry no trailing newline while untouched
    # entries still do; normalise so that no blank lines end up in the output.
    with open(jsonl_file_out, "w", encoding="utf-8") as fout:
        fout.writelines(line.rstrip("\r\n") + "\n" for line in lines)
def update_data(lines, i):
    """Recompute ``source_len`` for the jsonl record stored at ``lines[i]``.

    The record's ``source`` path has ``/cpfs01`` replaced by
    ``/cpfs_speech/data``. If the remapped wav exists, it is loaded with
    ``librosa.load(..., sr=16000)``, ``source_len`` is set to the audio length
    in units of 10 ms, ``source`` is set to the remapped path, and the record
    is re-serialised into ``lines[i]``. If the wav does not exist, the record
    content is kept as-is.

    In both cases ``lines[i]`` ends up WITHOUT a trailing newline, so a caller
    can append exactly one line terminator per entry.

    Args:
        lines: Mutable list of jsonl lines; only index ``i`` is modified.
        i: Index of the line to update.

    Raises:
        json.JSONDecodeError: If ``lines[i]`` is not valid JSON.
        KeyError: If the record has no ``source`` field.
    """
    line = lines[i]
    data = json.loads(line.strip())

    wav_path = data["source"].replace("/cpfs01", "/cpfs_speech/data")
    if not os.path.exists(wav_path):
        logging.warning("wav not found, keeping entry unchanged: %s", wav_path)
        # Keep the original text but drop its line terminator so updated and
        # skipped entries look the same to the writer.
        lines[i] = line.rstrip("\r\n")
        return

    waveform, _ = librosa.load(wav_path, sr=16000)
    sample_num = len(waveform)
    # samples / 16000 -> seconds, * 1000 -> milliseconds, / 10 -> 10 ms units.
    data["source_len"] = int(sample_num / 16000 * 1000 / 10)
    data["source"] = wav_path
    lines[i] = json.dumps(data, ensure_ascii=False)
def update_wav_len(jsonl_file_list_in, jsonl_file_out_dir, ncpu=1):
    """Run ``gen_scp_from_jsonl`` on every jsonl file named in a list file.

    Args:
        jsonl_file_list_in: Text file (UTF-8) with one jsonl path per line.
            Blank lines are ignored.
        jsonl_file_out_dir: Directory receiving the updated jsonl files; it is
            created if missing. Each output keeps the base name of its input,
            so inputs sharing a base name write to the same output path.
        ncpu: Number of worker threads forwarded to ``gen_scp_from_jsonl``.
    """
    os.makedirs(jsonl_file_out_dir, exist_ok=True)
    with open(jsonl_file_list_in, "r", encoding="utf-8") as f:
        # A blank line (typically the trailing one) is not a path; passing it
        # on would end in open("") failing inside gen_scp_from_jsonl.
        jsonl_paths = [entry.strip() for entry in f if entry.strip()]

    num_files = len(jsonl_paths)
    for i, jsonl in enumerate(jsonl_paths):
        jsonl_file_out = os.path.join(jsonl_file_out_dir, os.path.basename(jsonl))
        logging.info(f"{i}/{num_files}, jsonl: {jsonl}, {jsonl_file_out}")
        gen_scp_from_jsonl(jsonl, jsonl_file_out, ncpu)
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Hydra entry point: read the options from ``cfg`` and run ``update_wav_len``.

    The config is converted to a plain container (with interpolations
    resolved) and logged. Recognised keys are ``jsonl_file_list_in``,
    ``jsonl_file_out_dir`` and ``ncpu``; each falls back to the default given
    below when absent.
    """
    options = OmegaConf.to_container(cfg, resolve=True)
    logging.info(options)

    update_wav_len(
        options.get("jsonl_file_list_in", "/Users/zhifu/funasr1.0/data/list/data_jsonl.list"),
        options.get("jsonl_file_out_dir", "/Users/zhifu/funasr1.0/data_tmp"),
        options.get("ncpu", 1),
    )
"""
python -m funasr.datasets.audio_datasets.update_jsonl \
++jsonl_file_list_in=/Users/zhifu/funasr1.0/data/list/data_jsonl.list \
++jsonl_file_out_dir=/Users/zhifu/funasr1.0/data_tmp \
++ncpu=4
"""
# Run the Hydra entry point only when this file is executed as a script.
if __name__ == "__main__":
    main_hydra()