51 #if defined(_WIN32) && !defined(GNUWINCE) 52 #define srand48(x) srand(x) 53 #define lrand48() rand() 60 #include <sphinxbase/sphinxbase_export.h> 63 #include <sphinxbase/fixpoint.h> 73 #ifdef WORDS_BIGENDIAN 74 #define NATIVE_ENDIAN "big" 76 #define NATIVE_ENDIAN "little" 80 #define DEFAULT_SAMPLING_RATE 16000 82 #define DEFAULT_FRAME_RATE 100 85 #define DEFAULT_FRAME_SHIFT 160 87 #define DEFAULT_WINDOW_LENGTH 0.025625 89 #define DEFAULT_FFT_SIZE 512 91 #define DEFAULT_NUM_CEPSTRA 13 93 #define DEFAULT_NUM_FILTERS 40 96 #define DEFAULT_PRE_SPEECH 20 98 #define DEFAULT_POST_SPEECH 50 100 #define DEFAULT_START_SPEECH 10 103 #define DEFAULT_LOWER_FILT_FREQ 133.33334 105 #define DEFAULT_UPPER_FILT_FREQ 6855.4976 107 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97 109 #define DEFAULT_WARP_TYPE "inverse_linear" 113 #define waveform_to_cepstral_command_line_macro() \ 117 "Write out logspectral files instead of cepstra" }, \ 122 "Write out cepstral-smoothed logspectral files" }, \ 127 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \ 131 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \ 132 "Preemphasis parameter" }, \ 136 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \ 141 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \ 146 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \ 147 "Hamming window length" }, \ 151 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \ 156 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \ 157 "Number of filter banks" }, \ 161 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \ 162 "Lower edge of filters" }, \ 166 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \ 167 "Upper edge of filters" }, \ 172 "Normalize mel filters to unit area" }, \ 174 { "-round_filters", \ 177 "Round mel filter frequencies to DFT points" }, \ 181 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \ 182 "Number of cep coefficients" }, \ 187 "Use double bandwidth filters (same center freq)" }, \ 192 "Length of sin-curve for liftering, or 0 for no liftering." 
}, \ 194 { "-vad_prespeech", \ 196 ARG_STRINGIFY(DEFAULT_PRE_SPEECH), \ 197 "Num of speech frames to keep before silence to speech." }, \ 199 { "-vad_startspeech", \ 201 ARG_STRINGIFY(DEFAULT_START_SPEECH), \ 202 "Num of speech frames to trigger vad from silence to speech." }, \ 204 { "-vad_postspeech", \ 206 ARG_STRINGIFY(DEFAULT_POST_SPEECH), \ 207 "Num of silence frames to keep after from speech to silence." }, \ 209 { "-vad_threshold", \ 212 "Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level." }, \ 217 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \ 222 "Warping function type (or shape)" }, \ 227 "Parameters defining the warping function" }, \ 232 "Add 1/2-bit noise" }, \ 236 ARG_STRINGIFY(SEED), \ 237 "Seed for random number generator; if less than zero, pick our own" }, \ 242 "Remove DC offset from each frame" }, \ 247 "Remove noise with spectral subtraction in mel-energies" }, \ 249 { "-remove_silence", \ 252 "Enables VAD, removes silence frames from processing" }, \ 257 "Show input filenames" } \ 262 typedef fixed32 mfcc_t;
265 #define FLOAT2MFCC(x) FLOAT2FIX(x) 267 #define MFCC2FLOAT(x) FIX2FLOAT(x) 269 #define MFCCMUL(a,b) FIXMUL(a,b) 270 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out) 274 typedef float32 mfcc_t;
276 #define FLOAT2MFCC(x) (x) 278 #define MFCC2FLOAT(x) (x) 280 #define MFCCMUL(a,b) ((a)*(b)) 281 #define MFCCLN(x,in,out) log(x) 294 FE_OUTPUT_FILE_SUCCESS = 0,
295 FE_CONTROL_FILE_ERROR = -1,
297 FE_UNKNOWN_SINGLE_OR_BATCH = -3,
298 FE_INPUT_FILE_OPEN_ERROR = -4,
299 FE_INPUT_FILE_READ_ERROR = -5,
300 FE_MEM_ALLOC_ERROR = -6,
301 FE_OUTPUT_FILE_WRITE_ERROR = -7,
302 FE_OUTPUT_FILE_OPEN_ERROR = -8,
303 FE_ZERO_ENERGY_ERROR = -9,
304 FE_INVALID_PARAM_ERROR = -10
/**
 * Prototype: takes no arguments and returns a pointer to an fe_t.
 *
 * Presumably creates a front-end object from default/global
 * configuration -- TODO confirm against the implementation; nothing
 * visible here establishes it, nor who owns the returned pointer.
 *
 * NOTE(review): the leading "315" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
315 fe_t* fe_init_auto(
void);
/**
 * Prototype: takes no arguments and returns a pointer to const arg_t.
 *
 * Presumably the returned pointer refers to the front-end's table of
 * command-line argument definitions -- TODO confirm; the const
 * qualifier only tells us callers must not write through it.
 *
 * NOTE(review): the leading "325" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
325 arg_t const *fe_get_args(
void);
/**
 * Prototype: takes a front-end object and returns nothing.
 *
 * @param fe Front-end object to operate on.
 *
 * Presumably resets per-stream processing state -- TODO confirm against
 * the implementation; the declaration alone does not show it.
 *
 * NOTE(review): the leading "354" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
354 void fe_start_stream(
fe_t *fe);
/**
 * Prototype: takes a front-end object and returns an int.
 *
 * @param fe Front-end object to operate on.
 * @return int status/value; the meaning is not visible in this
 *         declaration (presumably 0 on success, negative on error --
 *         TODO confirm).
 *
 * NOTE(review): the leading "361" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
361 int fe_start_utt(
fe_t *fe);
/**
 * Prototype: takes a front-end object and returns an int.
 *
 * @param fe Front-end object to query.
 * @return int; presumably the dimensionality of one output feature
 *         vector -- TODO confirm against the implementation.
 *
 * NOTE(review): the leading "377" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
377 int fe_get_output_size(
fe_t *fe);
/**
 * Prototype: takes a front-end object plus two int pointers and returns
 * nothing, so any results must come back through the pointers.
 *
 * @param fe              Front-end object to query.
 * @param out_frame_shift Output pointer; by its name, receives the frame
 *                        shift (units presumably samples -- TODO confirm).
 * @param out_frame_size  Output pointer; by its name, receives the frame
 *                        size (units presumably samples -- TODO confirm).
 *
 * NOTE(review): the "393" and "394" tokens look like listing line
 * numbers fused into the text during extraction -- confirm and strip.
 */
393 void fe_get_input_size(
fe_t *fe,
int *out_frame_shift,
394 int *out_frame_size);
/**
 * Prototype: takes a front-end object and returns a uint8.
 *
 * @param fe Front-end object to query.
 * @return uint8; presumably a boolean-like voice-activity flag
 *         (non-zero while in speech) -- TODO confirm the encoding.
 *
 * NOTE(review): the leading "402" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
402 uint8 fe_get_vad_state(
fe_t *fe);
/**
 * Prototype: takes a front-end object, an mfcc_t buffer pointer and an
 * int32 pointer, and returns an int.
 *
 * @param fe            Front-end object to operate on.
 * @param out_cepvector Output buffer of mfcc_t; required capacity is not
 *                      visible here (presumably one feature vector --
 *                      TODO confirm).
 * @param out_nframes   Output pointer; by its name, receives a frame
 *                      count.
 * @return int status/value; meaning not visible in this declaration.
 *
 * NOTE(review): the leading "419" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
419 int fe_end_utt(
fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes);
/**
 * Prototype: takes a front-end object and returns an int.
 *
 * @param fe Front-end object to release.
 * @return int; the meaning is not visible here (could be a status or a
 *         remaining reference count -- TODO confirm).
 *
 * NOTE(review): the leading "437" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
437 int fe_free(
fe_t *fe);
/**
 * Prototype: extended frame-processing entry point, returning an int.
 *
 * @param fe                 Front-end object to operate on.
 * @param inout_spch         Pointer to a pointer to const int16 samples;
 *                           the "inout" prefix suggests the callee
 *                           advances it -- TODO confirm.
 * @param inout_nsamps       Pointer to a sample count (size_t);
 *                           presumably updated to the number of samples
 *                           left unconsumed -- TODO confirm.
 * @param inout_nframes      Pointer to a frame count (int32); presumably
 *                           capacity in, frames produced out -- TODO
 *                           confirm.
 * @param voiced_spch_nsamps Pointer to an int32 sample count relating to
 *                           voiced speech, by its name.
 * @param out_frameidx       Output pointer; by its name, receives a
 *                           frame index.
 * @return int status/value; meaning not visible in this declaration.
 *
 * NOTE(review): the embedded listing numbers run 453, 454, 455, 457,
 * 459, 460 -- 456 and 458 are absent, so one or more parameters (e.g.
 * the output feature buffer) may have been lost in extraction. Verify
 * this parameter list against the original header before relying on it.
 * The numeric tokens themselves look like extraction artifacts and
 * should be stripped once confirmed.
 */
453 int fe_process_frames_ext(
fe_t *fe,
454 int16
const **inout_spch,
455 size_t *inout_nsamps,
457 int32 *inout_nframes,
459 int32 *voiced_spch_nsamps,
460 int32 *out_frameidx);
/**
 * Prototype: frame-processing entry point, returning an int.
 *
 * @param fe            Front-end object to operate on.
 * @param inout_spch    Pointer to a pointer to const int16 samples; the
 *                      "inout" prefix suggests the callee advances it --
 *                      TODO confirm.
 * @param inout_nsamps  Pointer to a sample count (size_t); presumably
 *                      updated to the number of samples left unconsumed
 *                      -- TODO confirm.
 * @param inout_nframes Pointer to a frame count (int32); presumably
 *                      capacity in, frames produced out -- TODO confirm.
 * @param out_frameidx  Output pointer; by its name, receives a frame
 *                      index.
 * @return int status/value; meaning not visible in this declaration.
 *
 * NOTE(review): the embedded listing numbers run 512, 513, 514, 516,
 * 517 -- 515 is absent, so a parameter (e.g. the output feature buffer)
 * may have been lost in extraction. Verify this parameter list against
 * the original header before relying on it. The numeric tokens
 * themselves look like extraction artifacts and should be stripped once
 * confirmed.
 */
512 int fe_process_frames(
fe_t *fe,
513 int16
const **inout_spch,
514 size_t *inout_nsamps,
516 int32 *inout_nframes,
517 int32 *out_frameidx);
535 int fe_process_utt(
fe_t *fe,
/**
 * Prototype: takes an untyped pointer and returns nothing.
 *
 * @param arr Pointer to release; by the function name, presumably a 2-D
 *            array obtained from this library's allocator -- TODO
 *            confirm which allocation routine it must pair with.
 *
 * NOTE(review): the leading "546" looks like a listing line number that
 * was fused into the text during extraction -- confirm and strip.
 */
546 void fe_free_2d(
void *arr);
552 int fe_mfcc_to_float(
fe_t *fe,
561 int fe_float_to_mfcc(
fe_t *fe,
590 int fe_logspec_to_mfcc(
fe_t *fe,
591 const mfcc_t *fr_spec,
604 int fe_logspec_dct2(
fe_t *fe,
605 const mfcc_t *fr_spec,
618 int fe_mfcc_dct3(
fe_t *fe,
619 const mfcc_t *fr_cep,
Command-line and other configuration parsing and handling.
Argument definition structure.
Opaque structure used to hold the results of command-line parsing.
Structure for the front-end computation.