// Source: FunASR/runtime/onnxruntime/third_party/kaldi/decoder/decodable-matrix.h
// (repository web-view metadata: 254 lines, 9.5 KiB, C;
//  last change 2024-05-18 15:50:56 +08:00)

// decoder/decodable-matrix.h
// Copyright 2009-2011 Microsoft Corporation
// 2013 Johns Hopkins University (author: Daniel Povey)
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.
#ifndef KALDI_DECODER_DECODABLE_MATRIX_H_
#define KALDI_DECODER_DECODABLE_MATRIX_H_
#include <vector>
#include "base/kaldi-common.h"
#include "itf/decodable-itf.h"
#include "itf/transition-information.h"
#include "matrix/kaldi-matrix.h"
namespace kaldi {
class DecodableMatrixScaledMapped: public DecodableInterface {
public:
// This constructor creates an object that will not delete "likes" when done.
DecodableMatrixScaledMapped(const TransitionInformation &tm,
const Matrix<BaseFloat> &likes,
BaseFloat scale): trans_model_(tm), likes_(&likes),
tid_to_pdf_(trans_model_.TransitionIdToPdfArray()),
scale_(scale), delete_likes_(false) {
if (likes.NumCols() != tm.NumPdfs())
KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has "
<< likes.NumCols() << " cols but transition-model has "
<< tm.NumPdfs() << " pdf-ids.";
}
// This constructor creates an object that will delete "likes"
// when done.
DecodableMatrixScaledMapped(const TransitionInformation &tm,
BaseFloat scale,
const Matrix<BaseFloat> *likes):
trans_model_(tm), likes_(likes),
tid_to_pdf_(trans_model_.TransitionIdToPdfArray()),
scale_(scale), delete_likes_(true) {
if (likes->NumCols() != tm.NumPdfs())
KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has "
<< likes->NumCols() << " cols but transition-model has "
<< tm.NumPdfs() << " pdf-ids.";
}
virtual int32 NumFramesReady() const { return likes_->NumRows(); }
virtual bool IsLastFrame(int32 frame) const {
KALDI_ASSERT(frame < NumFramesReady());
return (frame == NumFramesReady() - 1);
}
// Note, frames are numbered from zero.
virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
KALDI_PARANOID_ASSERT(tid >= 1 && tid < tid_to_pdf_.size());
return scale_ * (*likes_)(frame, tid_to_pdf_[tid]);
}
// Indices are one-based! This is for compatibility with OpenFst.
virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
virtual ~DecodableMatrixScaledMapped() {
if (delete_likes_) delete likes_;
}
private:
const TransitionInformation &trans_model_; // for tid to pdf mapping
const Matrix<BaseFloat> *likes_;
const std::vector<int32> &tid_to_pdf_;
BaseFloat scale_;
bool delete_likes_;
KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixScaledMapped);
};
/**
This is like DecodableMatrixScaledMapped, but it doesn't support an acoustic
scale, and it does support a frame offset, whereby you can state that the
first row of 'likes' is actually the n'th row of the matrix of available
log-likelihoods. It's useful if the neural net output comes in chunks for
different frame ranges.
Note: DecodableMatrixMappedOffset solves the same problem in a slightly
different way, where you use the same decodable object. This one, unlike
DecodableMatrixMappedOffset, is compatible with when the loglikes are in a
SubMatrix.
*/
// Decodable object that reads log-likelihoods from a matrix and supports a
// frame offset; unlike DecodableMatrixScaledMapped it takes no scale argument.
// Only declarations appear here; the member functions are defined out of line.
class DecodableMatrixMapped: public DecodableInterface {
public:
// This constructor creates an object that will not delete "likes" when done.
// frame_offset is the frame index that row 0 of 'likes' corresponds to; it
// would be greater than zero if this is not the first chunk of likelihoods.
// Accepts any MatrixBase, not only a Matrix.
DecodableMatrixMapped(const TransitionInformation &tm,
const MatrixBase<BaseFloat> &likes,
int32 frame_offset = 0);
// This constructor creates an object that will delete "likes"
// when done.  frame_offset has the same meaning as in the constructor above.
DecodableMatrixMapped(const TransitionInformation &tm,
const Matrix<BaseFloat> *likes,
int32 frame_offset = 0);
// The four functions below share their names and signatures with the
// virtual functions of the other decodable classes in this file.
virtual int32 NumFramesReady() const;
virtual bool IsLastFrame(int32 frame) const;
virtual BaseFloat LogLikelihood(int32 frame, int32 tid);
// Note: these indices are 1-based.
virtual int32 NumIndices() const;
virtual ~DecodableMatrixMapped();
private:
const TransitionInformation &trans_model_; // for tid to pdf mapping
// Reference to a vector of int32; presumably the transition-id -> pdf-id
// array, as in the sibling classes in this file (bound in the out-of-line
// constructors -- confirm there).
const std::vector<int32>& tid_to_pdf_;
// The log-likelihoods being read.
const MatrixBase<BaseFloat> *likes_;
// NOTE(review): presumably the matrix to free in the destructor when the
// owning constructor was used, and unset otherwise -- confirm in the
// out-of-line definitions.
const Matrix<BaseFloat> *likes_to_delete_;
// Frame index that row 0 of 'likes_' corresponds to.
int32 frame_offset_;
// raw_data_ and stride_ are a kind of fast look-aside for 'likes_', to be
// used when KALDI_PARANOID is false.
const BaseFloat *raw_data_;
int32 stride_;
KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixMapped);
};
/**
This decodable class returns log-likes stored in a matrix; it supports
repeatedly writing to the matrix and setting a time-offset representing the
frame-index of the first row of the matrix. It's intended for use in
multi-threaded decoding; mutex and semaphores are not included. External
code will call AcceptLoglikes() each time more log-likelihoods are available.
If you try to access a log-likelihood that's no longer available because
the frame index is less than the current offset, it is of course an error.
See also DecodableMatrixMapped, which supports the same type of thing but
with a different interface where you are expected to re-construct the
object each time you want to decode.
*/
class DecodableMatrixMappedOffset: public DecodableInterface {
public:
DecodableMatrixMappedOffset(const TransitionInformation &tm):
trans_model_(tm), tid_to_pdf_(trans_model_.TransitionIdToPdfArray()),
frame_offset_(0), input_is_finished_(false) { }
// this is not part of the generic Decodable interface.
int32 FirstAvailableFrame() const { return frame_offset_; }
// Logically, this function appends 'loglikes' (interpreted as newly available
// frames) to the log-likelihoods stored in the class.
//
// This function is destructive of the input "loglikes" because it may
// under some circumstances do a shallow copy using Swap(). This function
// appends loglikes to any existing likelihoods you've previously supplied.
void AcceptLoglikes(Matrix<BaseFloat> *loglikes,
int32 frames_to_discard);
void InputIsFinished() { input_is_finished_ = true; }
virtual int32 NumFramesReady() const {
return loglikes_.NumRows() + frame_offset_;
}
virtual bool IsLastFrame(int32 frame) const {
KALDI_ASSERT(frame < NumFramesReady());
return (frame == NumFramesReady() - 1 && input_is_finished_);
}
virtual BaseFloat LogLikelihood(int32 frame, int32 tid) {
KALDI_PARANOID_ASSERT(tid >= 1 && tid < tid_to_pdf_.size());
int32 pdf_id = tid_to_pdf_[tid];
#ifdef KALDI_PARANOID
return loglikes_(frame - frame_offset_, pdf_id);
#else
// This does no checking, so will be faster.
return raw_data_[frame * stride_ + pdf_id];
#endif
}
virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); }
// nothing special to do in destructor.
virtual ~DecodableMatrixMappedOffset() { }
private:
const TransitionInformation &trans_model_; // for tid to pdf mapping
const std::vector<int32>& tid_to_pdf_;
Matrix<BaseFloat> loglikes_;
int32 frame_offset_;
bool input_is_finished_;
// 'raw_data_' and 'stride_' are intended as a fast look-aside which is an
// alternative to accessing data_. raw_data_ is a faked version of
// data_->Data() as if it started from frame zero rather than frame_offset_.
// This simplifies the code of LogLikelihood(), in cases where KALDI_PARANOID
// is not defined.
BaseFloat *raw_data_;
int32 stride_;
KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixMappedOffset);
};
class DecodableMatrixScaled: public DecodableInterface {
public:
DecodableMatrixScaled(const Matrix<BaseFloat> &likes,
BaseFloat scale):
likes_(likes), scale_(scale) { }
virtual int32 NumFramesReady() const { return likes_.NumRows(); }
virtual bool IsLastFrame(int32 frame) const {
KALDI_ASSERT(frame < NumFramesReady());
return (frame == NumFramesReady() - 1);
}
// Note, frames are numbered from zero.
virtual BaseFloat LogLikelihood(int32 frame, int32 index) {
if (index > likes_.NumCols() || index <= 0 ||
frame < 0 || frame >= likes_.NumRows())
KALDI_ERR << "Invalid (frame, index - 1) = ("
<< frame << ", " << index - 1 << ") for matrix of size "
<< likes_.NumRows() << " x " << likes_.NumCols();
return scale_ * likes_(frame, index - 1);
}
// Indices are one-based! This is for compatibility with OpenFst.
virtual int32 NumIndices() const { return likes_.NumCols(); }
private:
const Matrix<BaseFloat> &likes_;
BaseFloat scale_;
KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixScaled);
};
} // namespace kaldi
#endif // KALDI_DECODER_DECODABLE_MATRIX_H_