// decoder/decoder-wrappers.h // Copyright 2014 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #ifndef KALDI_DECODER_DECODER_WRAPPERS_H_ #define KALDI_DECODER_DECODER_WRAPPERS_H_ #include "itf/options-itf.h" #include "decoder/lattice-faster-decoder.h" #include "decoder/lattice-incremental-decoder.h" #include "decoder/lattice-simple-decoder.h" // This header contains declarations from various convenience functions that are called // from binary-level programs such as gmm-decode-faster.cc, gmm-align-compiled.cc, and // so on. namespace kaldi { struct AlignConfig { BaseFloat beam; BaseFloat retry_beam; bool careful; AlignConfig(): beam(200.0), retry_beam(0.0), careful(false) { } void Register(OptionsItf *opts) { opts->Register("beam", &beam, "Decoding beam used in alignment"); opts->Register("retry-beam", &retry_beam, "Decoding beam for second try at alignment"); opts->Register("careful", &careful, "If true, do 'careful' alignment, which is better at detecting " "alignment failure (involves loop to start of decoding graph)."); } }; /// AlignUtteranceWapper is a wrapper for alignment code used in training, that /// is called from many different binaries, e.g. gmm-align, gmm-align-compiled, /// sgmm-align, etc. The writers for alignments and words will only be written /// to if they are open. The num_done, num_error, num_retried, tot_like and /// frame_count pointers will (if non-NULL) be incremented or added to, not set, /// by this function. void AlignUtteranceWrapper( const AlignConfig &config, const std::string &utt, BaseFloat acoustic_scale, // affects scores written to scores_writer, if // present fst::VectorFst *fst, // non-const in case config.careful == // true, we add loop. DecodableInterface *decodable, // not const but is really an input. Int32VectorWriter *alignment_writer, BaseFloatWriter *scores_writer, int32 *num_done, int32 *num_error, int32 *num_retried, double *tot_like, int64 *frame_count, BaseFloatVectorWriter *per_frame_acwt_writer = NULL); /// This function modifies the decoding graph for what we call "careful /// alignment". The problem we are trying to solve is that if the decoding eats /// up the words in the graph too fast, it can get stuck at the end, and produce /// what looks like a valid alignment even though there was really a failure. /// So what we want to do is to introduce, after the final-states of the graph, /// a "blind alley" with no final-probs reachable, where the decoding can go to /// get lost. Our basic idea is to append the decoding-graph to itself using /// the fst Concat operation; but in order that there should be final-probs at the end of /// the first but not the second FST, we modify the right-hand argument to the /// Concat operation so that it has none of the original final-probs, and add /// a "pre-initial" state that is final. void ModifyGraphForCarefulAlignment( fst::VectorFst *fst); /// TODO template bool DecodeUtteranceLatticeIncremental( LatticeIncrementalDecoderTpl &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. const TransitionInformation &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, bool determinize, bool allow_partial, Int32VectorWriter *alignments_writer, Int32VectorWriter *words_writer, CompactLatticeWriter *compact_lattice_writer, LatticeWriter *lattice_writer, double *like_ptr); // puts utterance's likelihood in like_ptr on success. /// This function DecodeUtteranceLatticeFaster is used in several decoders, and /// we have moved it here. Note: this is really "binary-level" code as it /// involves table readers and writers; we've just put it here as there is no /// other obvious place to put it. If determinize == false, it writes to /// lattice_writer, else to compact_lattice_writer. The writers for /// alignments and words will only be written to if they are open. /// /// Caution: this will only link correctly if FST is either fst::Fst, /// or fst::GrammarFst, as the template function is defined in the .cc file and /// only instantiated for those two types. template bool DecodeUtteranceLatticeFaster( LatticeFasterDecoderTpl &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. const TransitionInformation &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, bool determinize, bool allow_partial, Int32VectorWriter *alignments_writer, Int32VectorWriter *words_writer, CompactLatticeWriter *compact_lattice_writer, LatticeWriter *lattice_writer, double *like_ptr); // puts utterance's likelihood in like_ptr on success. /// This class basically does the same job as the function /// DecodeUtteranceLatticeFaster, but in a way that allows us /// to build a multi-threaded command line program more easily. /// The main computation takes place in operator (), and the output /// happens in the destructor. class DecodeUtteranceLatticeFasterClass { public: // Initializer sets various variables. // NOTE: we "take ownership" of "decoder" and "decodable". These // are deleted by the destructor. On error, "num_err" is incremented. DecodeUtteranceLatticeFasterClass( LatticeFasterDecoder *decoder, DecodableInterface *decodable, const TransitionInformation &trans_model, const fst::SymbolTable *word_syms, const std::string &utt, BaseFloat acoustic_scale, bool determinize, bool allow_partial, Int32VectorWriter *alignments_writer, Int32VectorWriter *words_writer, CompactLatticeWriter *compact_lattice_writer, LatticeWriter *lattice_writer, double *like_sum, // on success, adds likelihood to this. int64 *frame_sum, // on success, adds #frames to this. int32 *num_done, // on success (including partial decode), increments this. int32 *num_err, // on failure, increments this. int32 *num_partial); // If partial decode (final-state not reached), increments this. void operator () (); // The decoding happens here. ~DecodeUtteranceLatticeFasterClass(); // Output happens here. private: // The following variables correspond to inputs: LatticeFasterDecoder *decoder_; DecodableInterface *decodable_; const TransitionInformation *trans_model_; const fst::SymbolTable *word_syms_; std::string utt_; BaseFloat acoustic_scale_; bool determinize_; bool allow_partial_; Int32VectorWriter *alignments_writer_; Int32VectorWriter *words_writer_; CompactLatticeWriter *compact_lattice_writer_; LatticeWriter *lattice_writer_; double *like_sum_; int64 *frame_sum_; int32 *num_done_; int32 *num_err_; int32 *num_partial_; // The following variables are stored by the computation. bool computed_; // operator () was called. bool success_; // decoding succeeded (possibly partial) bool partial_; // decoding was partial. CompactLattice *clat_; // Stored output, if determinize_ == true. Lattice *lat_; // Stored output, if determinize_ == false. }; // This function DecodeUtteranceLatticeSimple is used in several decoders, and // we have moved it here. Note: this is really "binary-level" code as it // involves table readers and writers; we've just put it here as there is no // other obvious place to put it. If determinize == false, it writes to // lattice_writer, else to compact_lattice_writer. The writers for // alignments and words will only be written to if they are open. bool DecodeUtteranceLatticeSimple( LatticeSimpleDecoder &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. const TransitionInformation &trans_model, const fst::SymbolTable *word_syms, std::string utt, double acoustic_scale, bool determinize, bool allow_partial, Int32VectorWriter *alignments_writer, Int32VectorWriter *words_writer, CompactLatticeWriter *compact_lattice_writer, LatticeWriter *lattice_writer, double *like_ptr); // puts utterance's likelihood in like_ptr on success. } // end namespace kaldi. #endif