import logging
import math
import random

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data import BatchSampler, Dataset, DistributedSampler, Sampler

from funasr.register import tables


@tables.register("batch_sampler_classes", "EspnetStyleBatchSampler")
def EspnetStyleBatchSampler_fn(dataset, **kwargs):
    """Build the DataLoader keyword arguments around an EspnetStyleBatchSampler."""
    dataloader_args = {}

    batch_sampler = EspnetStyleBatchSampler(dataset, **kwargs)
    dataloader_args["batch_sampler"] = batch_sampler
    dataloader_args["num_workers"] = kwargs.get("num_workers", 4)
    dataloader_args["pin_memory"] = kwargs.get("pin_memory", True)

    return dataloader_args

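
# Illustrative usage sketch (not part of the original file): the dict returned by
# EspnetStyleBatchSampler_fn is meant to be unpacked into torch.utils.data.DataLoader.
# `my_dataset` and `my_collate_fn` are hypothetical placeholders; the dataset is assumed
# to implement __len__, get_source_len, and get_target_len.
#
#     dataloader_args = EspnetStyleBatchSampler_fn(
#         my_dataset, batch_size=8000, batch_type="token", num_workers=4
#     )
#     loader = torch.utils.data.DataLoader(my_dataset, collate_fn=my_collate_fn, **dataloader_args)
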

class EspnetStyleBatchSampler(DistributedSampler):
    def __init__(
        self,
        dataset,
        batch_size,
        batch_type="token",
        rank=None,
        num_replicas=None,
        rank_split=False,
        shuffle=True,
        drop_last=False,
        is_training: bool = True,
        sort_size: int = 1024,
        start_step: int = 0,
        **kwargs,
    ):
        # Resolve rank/world size from torch.distributed when a process group is
        # initialized; otherwise fall back to single-process defaults.
        if dist.is_available() and dist.is_initialized():
            rank = dist.get_rank()
            num_replicas = dist.get_world_size()
        else:
            rank = 0
            num_replicas = 1
        # if rank_split:
        #     logging.info(f"Warning, rank_split: {rank_split}, batch and shuffle data in local rank")
        #     rank = 0
        #     num_replicas = 1
        self.rank = rank
        self.num_replicas = num_replicas
        self.dataset = dataset
        self.batch_size = batch_size
        self.batch_type = batch_type
        self.is_training = is_training
        self.shuffle = shuffle and is_training
        self.drop_last = drop_last

        self.total_size = len(self.dataset)
        self.num_samples = int(math.ceil(self.total_size / self.num_replicas))
        self.epoch = 0
        self.sort_size = sort_size * num_replicas
        self.max_token_length = kwargs.get("max_token_length", 2048)
        self.min_token_length = kwargs.get("min_token_length", 0)
        self.length_scale_source = kwargs.get("length_scale_source", 1.0)
        self.start_step = start_step
        if self.start_step > 0:
            logging.info(f"Warning, start_step > 0, dataloader starts from step: {self.start_step}")
        # super().__init__(dataset, num_replicas=num_replicas, rank=rank,
        #                  shuffle=shuffle, drop_last=drop_last)

    def __iter__(self):
        if self.shuffle:
            # Seed both torch and random with the epoch so every rank builds
            # the same permutation and batch split.
            g = torch.Generator()
            g.manual_seed(self.epoch)
            random.seed(self.epoch)
            indices = torch.randperm(len(self.dataset), generator=g).tolist()
        else:
            indices = list(range(len(self.dataset)))

        # Sort indices by source length so similarly sized samples share a batch
        sorted_indices = sorted(indices, key=lambda idx: self.dataset.get_source_len(idx))

        # Organize batches based on the batch type ("token" or "example")
        buffer_batches = []
        batch = []
        max_len_in_batch = 0  # Tracks the max sample length within the current batch

        for idx in sorted_indices:
            # original_sample_length = self.dataset.get_source_len(idx)
            # if (
            #     original_sample_length < self.min_token_length
            #     or original_sample_length > self.max_token_length
            # ):  # Skip samples outside the allowed length range
            #     continue
            # sample_length = 1 if self.batch_type == "example" else original_sample_length

            # Set sample_length based on the batch type
            if self.batch_type == "example":
                sample_length = 1
            elif self.batch_type == "token":
                sample_length = self.dataset.get_source_len(idx) + int(
                    self.dataset.get_target_len(idx) * 1.2
                )
            else:
                sample_length = self.dataset.get_source_len(idx)

            # The cost of a batch is its padded size: the longest sample times
            # the number of samples it would hold after adding this one.
            potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)
            # Add the index to the current batch if it stays within the budget
            if potential_batch_length <= self.batch_size:
                batch.append(idx)
                max_len_in_batch = max(max_len_in_batch, sample_length)
            else:
                # Save the current batch and start a new one
                buffer_batches.append(batch)
                batch = [idx]
                max_len_in_batch = sample_length

        # Add the last batch unless drop_last discards an under-filled one
        if batch and (not self.drop_last or len(batch) * max_len_in_batch == self.batch_size):
            buffer_batches.append(batch)

        # Shuffle the list of batches
        if self.shuffle:
            random.seed(self.epoch)
            random.shuffle(buffer_batches)

        # Ensure each rank gets the same number of batches by padding with
        # randomly re-drawn batches when the count is not divisible
        batches_per_rank = int(math.ceil(len(buffer_batches) / self.num_replicas))
        total_batches_needed = batches_per_rank * self.num_replicas
        extra_batches = total_batches_needed - len(buffer_batches)
        if extra_batches > 0:
            buffer_batches += random.choices(buffer_batches, k=extra_batches)

        # Allocate the contiguous slice of batches belonging to the current rank,
        # skipping the first start_step batches when resuming mid-epoch
        start_idx = self.rank * batches_per_rank
        end_idx = start_idx + batches_per_rank
        rank_batches = buffer_batches[start_idx + self.start_step : end_idx]
        logging.info(
            f"rank: {self.rank}, dataloader start from step: {self.start_step}, "
            f"batch_num: {end_idx - start_idx}, batch_num_after_step: {len(rank_batches)}"
        )
        # Return an iterator over the batches for the current rank
        return iter(rank_batches)

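    # Worked example of the "token" budget above (illustrative numbers only):
    # with batch_size=600 and sorted sample costs [96, 128, 144, 176], the first
    # three samples fit because max(96, 128, 144) * 3 = 432 <= 600, but adding the
    # fourth would cost max(144, 176) * 4 = 704 > 600, so a new batch is started.
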
    def __len__(self):
        # The true number of batches per rank is only known once __iter__ has
        # packed the dynamic batches, so a placeholder value is returned here.
        return 1

    def set_epoch(self, epoch):
        # Set the epoch used to seed shuffling so each epoch sees a new order
        self.epoch = epoch
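

# Minimal usage sketch (illustrative only, not part of the original file). It assumes
# a toy dataset exposing the three methods the sampler relies on: __len__,
# get_source_len, and get_target_len; all lengths below are made up.
if __name__ == "__main__":

    class _ToyLengthDataset:
        def __init__(self, source_lengths):
            self.source_lengths = source_lengths

        def __len__(self):
            return len(self.source_lengths)

        def get_source_len(self, idx):
            return self.source_lengths[idx]

        def get_target_len(self, idx):
            # Pretend targets are roughly half as long as their sources.
            return self.source_lengths[idx] // 2

    toy_dataset = _ToyLengthDataset([120, 80, 200, 60, 150, 90, 170, 110])
    sampler = EspnetStyleBatchSampler(
        toy_dataset, batch_size=600, batch_type="token", shuffle=False
    )
    for step, batch_indices in enumerate(sampler):
        print(f"step {step}: indices={batch_indices}")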