59 #include "ngram_model_internal.h" 60 #include "ngram_model_trie.h" 67 ext = strrchr(file_name,
'.');
72 while (--ext >= file_name) {
76 if (ext < file_name) {
81 while (--ext >= file_name) {
85 if (ext < file_name) {
125 const char *file_name,
132 ngram_model_trie_read_bin(config, file_name,
136 ngram_model_trie_read_arpa(config, file_name,
140 ngram_model_trie_read_dmp(config, file_name,
146 model = ngram_model_trie_read_arpa(config, file_name, lmath);
150 ngram_model_trie_read_bin(config, file_name, lmath)) != NULL)
153 ngram_model_trie_read_dmp(config, file_name, lmath)) != NULL)
157 E_ERROR(
"language model file type not supported\n");
167 lw = cmd_ln_float32_r(config,
"-lw");
169 wip = cmd_ln_float32_r(config,
"-wip");
190 return ngram_model_trie_write_arpa(model, file_name);
192 return ngram_model_trie_write_bin(model, file_name);
194 E_ERROR(
"language model file type not supported\n");
197 E_ERROR(
"language model file type not supported\n");
204 logmath_t * lmath, int32 n, int32 n_unigram)
213 if (base->
lmath != lmath) {
225 for (i = 0; i < base->
n_words; ++i) {
232 n_unigram *
sizeof(
char *));
275 for (i = 0; i < model->
n_words; ++i) {
286 for (j = 0; j < lmclass->
n_words; ++j) {
289 for (j = 0; j < lmclass->
n_hash; ++j) {
290 if (lmclass->nword_hash[j].
wid != -1) {
297 ngram_class_free(model->
classes[i]);
321 for (i = 0; i < model->
n_words; ++i) {
330 if (outstr[0] ==
'<' || outstr[0] ==
'[') {
349 E_WARN(
"Duplicate word in dictionary after conversion: %s\n",
355 model->
wid = new_wid;
376 int32 n_hist, int32 * n_used)
378 int32 score, class_weight = 0;
386 if (NGRAM_IS_CLASSWID(wid)) {
389 class_weight = ngram_class_prob(lmclass, wid);
390 if (class_weight == 1)
394 for (i = 0; i < n_hist; ++i) {
396 && NGRAM_IS_CLASSWID(history[i]))
400 score = (*model->
funcs->
score) (model, wid, history, n_hist, n_used);
403 return score + class_weight;
416 va_start(history, word);
418 while ((hword = va_arg(history,
const char *)) != NULL)
423 va_start(history, word);
425 while ((hword = va_arg(history,
const char *)) != NULL) {
426 histid[n_hist] =
ngram_wid(model, hword);
432 histid, n_hist, &n_used);
455 int32 n_hist, int32 * n_used)
457 int32 prob, class_weight = 0;
465 if (NGRAM_IS_CLASSWID(wid)) {
468 class_weight = ngram_class_prob(lmclass, wid);
469 if (class_weight == 1)
473 for (i = 0; i < n_hist; ++i) {
475 && NGRAM_IS_CLASSWID(history[i]))
482 return prob + class_weight;
495 va_start(history, word);
497 while ((hword = va_arg(history,
const char *)) != NULL)
502 va_start(history, word);
504 while ((hword = va_arg(history,
const char *)) != NULL) {
505 histid[n_hist] =
ngram_wid(model, hword);
511 histid, n_hist, &n_used);
525 ctx_id = (int32 *)
ckd_calloc(n - 1,
sizeof(*ctx_id));
526 for (i = 1; i < (uint32) n; ++i)
527 ctx_id[i - 1] =
ngram_wid(model, words[i]);
544 prob = (int32) (prob / base->
lw);
599 wid = NGRAM_BASEWID(wid);
610 const char *word, int32 classid)
616 E_WARN(
"Omit duplicate word '%s'\n", word);
623 wid = NGRAM_CLASSWID(wid, classid);
640 (
"Hash insertion failed for word %s => %p (should not happen)\n",
650 const char *word, float32 weight)
656 E_WARN(
"Can't add word '%s' to read-only language model. " 657 "Disable mmap with '-mmap no' to make it writable\n", word);
661 wid = ngram_add_word_internal(model, word, -1);
677 ngram_class_new(
ngram_model_t * model, int32 tag_wid, int32 start_wid,
691 lmclass->nword_hash = NULL;
694 for (gn = classwords; gn; gn = gnode_next(gn)) {
695 tprob += gnode_float32(gn);
697 if (tprob > 1.1 || tprob < 0.9) {
698 E_INFO(
"Total class probability is %f, will normalize\n", tprob);
699 for (gn = classwords; gn; gn = gnode_next(gn)) {
700 gn->data.fl /= tprob;
703 for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
711 ngram_class_add_word(
ngram_class_t * lmclass, int32 wid, int32 lweight)
715 if (lmclass->nword_hash == NULL) {
717 lmclass->nword_hash =
718 ckd_malloc(NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
719 memset(lmclass->nword_hash, 0xff,
720 NGRAM_HASH_SIZE *
sizeof(*lmclass->nword_hash));
721 lmclass->
n_hash = NGRAM_HASH_SIZE;
727 hash = wid & (lmclass->
n_hash - 1);
728 if (lmclass->nword_hash[hash].
wid == -1) {
730 lmclass->nword_hash[hash].
wid = wid;
731 lmclass->nword_hash[hash].
prob1 = lweight;
738 while (lmclass->nword_hash[hash].
next != -1)
739 hash = lmclass->nword_hash[hash].
next;
744 lmclass->nword_hash =
ckd_realloc(lmclass->nword_hash,
748 memset(lmclass->nword_hash + lmclass->
n_hash, 0xff,
749 lmclass->
n_hash *
sizeof(*lmclass->nword_hash));
756 for (next = 0; next < lmclass->
n_hash; ++next)
757 if (lmclass->nword_hash[next].
wid == -1)
760 assert(next != lmclass->
n_hash);
762 lmclass->nword_hash[next].
wid = wid;
763 lmclass->nword_hash[next].
prob1 = lweight;
764 lmclass->nword_hash[hash].
next = next;
780 const char *classname,
781 const char *word, float32 weight)
784 int32 classid, tag_wid, wid, i, scale;
792 E_ERROR(
"No such word or class tag: %s\n", classname);
795 for (classid = 0; classid < model->
n_classes; ++classid) {
802 (
"Word %s is not a class tag (call ngram_model_add_class() first)\n",
806 lmclass = model->
classes[classid];
809 wid = ngram_add_word_internal(model, word, classid);
819 for (i = 0; i < lmclass->
n_words; ++i)
820 lmclass->
prob1[i] += scale;
821 for (i = 0; i < lmclass->
n_hash; ++i)
822 if (lmclass->nword_hash[i].
wid != -1)
823 lmclass->nword_hash[i].
prob1 += scale;
826 return ngram_class_add_word(lmclass, wid,
832 const char *classname,
834 char **words,
const float32 * weights, int32 n_words)
838 int32 i, start_wid = -1;
839 int32 classid, tag_wid;
850 E_ERROR(
"Number of classes cannot exceed 128 (sorry)\n");
854 for (i = 0; i < n_words; ++i) {
857 wid = ngram_add_word_internal(model, words[i], classid);
861 start_wid = NGRAM_BASEWID(wid);
865 lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
877 model->
classes[classid] = lmclass;
884 int32 base_wid = NGRAM_BASEWID(wid);
886 if (base_wid < lmclass->start_wid
891 hash = wid & (lmclass->
n_hash - 1);
892 while (hash != -1 && lmclass->nword_hash[hash].
wid != wid)
893 hash = lmclass->nword_hash[hash].
next;
896 return lmclass->nword_hash[hash].
prob1;
904 read_classdef_file(
hash_table_t * classes,
const char *file_name)
913 char *classname = NULL;
915 if ((fp =
fopen_comp(file_name,
"r", &is_pipe)) == NULL) {
916 E_ERROR(
"File %s not found\n", file_name);
926 if (fgets(line,
sizeof(line), fp) == NULL)
935 if (n_words == 2 && 0 == strcmp(wptr[0],
"END")) {
940 if (classname == NULL || 0 != strcmp(wptr[1], classname))
949 classdef->words =
ckd_calloc(classdef->n_words,
950 sizeof(*classdef->words));
951 classdef->weights =
ckd_calloc(classdef->n_words,
952 sizeof(*classdef->weights));
955 for (i = 0; i < classdef->n_words; ++i) {
957 classdef->weights[i] = gnode_float32(weight);
958 word = gnode_next(word);
959 weight = gnode_next(weight);
965 classdef_free(classdef);
991 if (n_words == 2 && 0 == strcmp(wptr[0],
"LMCLASS")) {
1005 for (gn = classwords; gn; gn = gnode_next(gn))
1018 for (i = 0; i < classdef->n_words; ++i)
1035 if (read_classdef_file(classes, file_name) < 0) {
1042 for (gn = hl; gn; gn = gnode_next(gn)) {
1049 classdef->n_words) < 0)
1055 for (gn = hl; gn; gn = gnode_next(gn)) {
1058 classdef_free(he->
val);
SPHINXBASE_EXPORT int32 ngram_probv(ngram_model_t *model, const char *word,...)
Get the "raw" log-probability for a general N-Gram.
SPHINXBASE_EXPORT int32 ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick "raw" probability lookup for a general N-Gram.
struct ngram_funcs_s * funcs
Implementation-specific methods.
SPHINXBASE_EXPORT int32 hash_table_lookup_int32(hash_table_t *h, const char *key, int32 *val)
Look up a 32-bit integer value in a hash table.
int32 next
Index of next bucket (or -1 for no collision)
SPHINXBASE_EXPORT glist_t glist_add_float32(glist_t g, float32 val)
Create and prepend a new list node containing a single-precision float.
SPHINXBASE_EXPORT ngram_model_t * ngram_model_read(cmd_ln_t *config, const char *file_name, ngram_file_type_t file_type, logmath_t *lmath)
Read an N-Gram model from a file on disk.
Miscellaneous useful string functions.
#define E_INFO(...)
Print logging information to standard error stream.
SPHINXBASE_EXPORT int ngram_model_write(ngram_model_t *model, const char *file_name, ngram_file_type_t format)
Write an N-Gram model to disk.
SPHINXBASE_EXPORT int ngram_model_casefold(ngram_model_t *model, int kase)
Case-fold word strings in an N-Gram model.
#define ckd_calloc(n, sz)
Macros to simplify the use of above functions.
#define E_ERROR(...)
Print error message to error log.
SPHINXBASE_EXPORT int32 ngram_unknown_wid(ngram_model_t *model)
Get the unknown word ID for a language model.
#define hash_table_enter_int32(h, k, v)
Add a 32-bit integer value to a hash table.
hash_table_t * wid
Mapping of unigram names to word IDs.
char ** word_str
Unigram names.
Sphinx's memory allocation/deallocation routines.
SPHINXBASE_EXPORT int32 ngram_wid(ngram_model_t *model, const char *word)
Look up numerical word ID.
int32 n_hash
Number of buckets in nword_hash (power of 2)
SPHINXBASE_EXPORT glist_t hash_table_tolist(hash_table_t *h, int32 *count)
Build a glist of valid hash_entry_t pointers from the given hash table.
#define NGRAM_INVALID_WID
Impossible word ID.
SPHINXBASE_EXPORT int cmd_ln_exists_r(cmd_ln_t *cmdln, char const *name)
Re-entrant version of cmd_ln_exists().
File name related operations.
SPHINXBASE_EXPORT uint32 const * ngram_model_get_counts(ngram_model_t *model)
Get the counts of the various N-grams in the model.
SPHINXBASE_EXPORT int logmath_log(logmath_t *lmath, float64 p)
Convert linear floating point number to integer log in base B.
SPHINXBASE_EXPORT int32 ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
Quick trigram score lookup.
A node in a generic list.
uint8 writable
Are word strings writable?
SPHINXBASE_EXPORT int ngram_model_free(ngram_model_t *model)
Release memory associated with an N-Gram model.
#define ckd_salloc(ptr)
Macro for ckd_salloc
SPHINXBASE_EXPORT hash_table_t * hash_table_new(int32 size, int32 casearg)
Allocate a new hash table for a given expected size.
SPHINXBASE_EXPORT void hash_table_empty(hash_table_t *h)
Delete all entries from a hash_table.
SPHINXBASE_EXPORT void ckd_free(void *ptr)
Test and free a 1-D array.
SPHINXBASE_EXPORT glist_t glist_add_ptr(glist_t g, void *ptr)
Create and prepend a new list node, with the given user-defined data, at the HEAD of the given generic list.
int32 n_words
Number of base words for this class.
SPHINXBASE_EXPORT int32 strcmp_nocase(const char *str1, const char *str2)
(FIXME! The implementation is incorrect!) Case insensitive string compare.
int32 log_zero
Zero probability, cached here for quick lookup.
SPHINXBASE_EXPORT void hash_table_free(hash_table_t *h)
Free the specified hash table; the caller is responsible for freeing the key strings pointed to by the table entries.
int refcount
Reference count.
SPHINXBASE_EXPORT int ngram_model_apply_weights(ngram_model_t *model, float32 lw, float32 wip)
Apply a language weight, insertion penalty, and unigram weight to a language model.
A note by ARCHAN at 20050510: Technically what we use is so-called "hash table with buckets" which is...
int32 n_1g_alloc
Number of allocated word strings (for new word addition)
SPHINXBASE_EXPORT double atof_c(char const *str)
Locale independent version of atof().
SPHINXBASE_EXPORT int32 strncmp_nocase(const char *str1, const char *str2, size_t len)
Like strcmp_nocase() but with a maximum length.
int32(* raw_score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying raw language model probability.
ARPA text format (the standard).
SPHINXBASE_EXPORT glist_t glist_reverse(glist_t g)
Reverse the order of the given glist.
uint32 * n_counts
Counts for 1, 2, 3, ... grams.
int32 tag_wid
Base word ID for this class tag.
SPHINXBASE_EXPORT void glist_free(glist_t g)
Free the given generic list; user-defined data contained within is not automatically freed...
SPHINXBASE_EXPORT ngram_model_t * ngram_model_retain(ngram_model_t *model)
Retain ownership of an N-Gram model.
SPHINXBASE_EXPORT int32 ngram_score(ngram_model_t *model, const char *word,...)
Get the score (scaled, interpolated log-probability) for a general N-Gram.
#define gnode_ptr(g)
Head of a list of gnodes.
SPHINXBASE_EXPORT int32 ngram_model_get_size(ngram_model_t *model)
Get the order of the N-gram model (i.e. the "N" in "N-gram").
int32 n_hash_inuse
Number of words in nword_hash.
SPHINXBASE_EXPORT char const * ngram_type_to_str(int type)
Get the canonical name for an N-Gram file type.
SPHINXBASE_EXPORT int32 ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
Quick bigram score lookup.
SPHINXBASE_EXPORT int32 ngram_model_add_word(ngram_model_t *model, const char *word, float32 weight)
Add a word (unigram) to the language model.
uint8 n
This is an n-gram model (1, 2, 3, ...).
Implementation of logging routines.
logmath_t * lmath
Log-math object.
SPHINXBASE_EXPORT int32 ngram_model_add_class(ngram_model_t *model, const char *classname, float32 classweight, char **words, const float32 *weights, int32 n_words)
Add a new class to a language model.
SPHINXBASE_EXPORT void * hash_table_enter(hash_table_t *h, const char *key, void *val)
Try to add a new entry with given key and associated value to hash table h.
SPHINXBASE_EXPORT FILE * fopen_comp(const char *file, const char *mode, int32 *ispipe)
Like fopen, but use popen and zcat if it is determined that "file" is compressed (i.e., has a .z, .Z, .gz, or .GZ extension).
One class definition from a classdef file.
#define E_WARN(...)
Print warning message to error log.
SPHINXBASE_EXPORT ngram_file_type_t ngram_str_to_type(const char *str_name)
Get the N-Gram file type from a string.
int32 start_wid
Starting base word ID for this class' words.
SPHINXBASE_EXPORT int32 ngram_model_read_classdef(ngram_model_t *model, const char *file_name)
Read a class definition file and add classes to a language model.
SPHINXBASE_EXPORT int logmath_get_zero(logmath_t *lmath)
Get the smallest possible value represented in this base.
SPHINXBASE_EXPORT void ngram_model_flush(ngram_model_t *lm)
Flush any cached N-Gram information.
uint8 n_classes
Number of classes (maximum 128)
SPHINXBASE_EXPORT int32 str2words(char *line, char **wptr, int32 n_wptr)
Convert a line to an array of "words", based on whitespace separators.
Opaque structure used to hold the results of command-line parsing.
#define ckd_malloc(sz)
Macro for ckd_malloc
enum ngram_file_type_e ngram_file_type_t
File types for N-Gram files.
Implementation-specific functions for operating on ngram_model_t objects.
SPHINXBASE_EXPORT float32 ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip)
Get the current weights from a language model.
SPHINXBASE_EXPORT ngram_file_type_t ngram_file_name_to_type(const char *file_name)
Guess the file type for an N-Gram model from the filename.
float32 lw
Language model scaling factor.
Implementation of ngram_class_t.
SPHINXBASE_EXPORT void lcase(char *str)
Convert str to all lower case.
int32 prob1
Probability for this word.
SPHINXBASE_EXPORT void ucase(char *str)
Convert str to all upper case.
Common implementation of ngram_model_t.
void * val
Key-length; the key string does not have to be a C-style NULL terminated string; it can have arbitrar...
void(* free)(ngram_model_t *model)
Implementation-specific function for freeing an ngram_model_t.
int32 wid
Word ID of this bucket.
SPHINXBASE_EXPORT int32 ngram_score_to_prob(ngram_model_t *model, int32 score)
Convert score to "raw" log-probability.
void(* flush)(ngram_model_t *model)
Implementation-specific function for purging N-Gram cache.
SPHINXBASE_EXPORT const char * ngram_word(ngram_model_t *model, int32 wid)
Look up word string for numerical word ID.
SPHINXBASE_EXPORT void fclose_comp(FILE *fp, int32 ispipe)
Close a file opened using fopen_comp.
SPHINXBASE_EXPORT int32 ngram_zero(ngram_model_t *model)
Get the "zero" log-probability value for a language model.
Fast integer logarithmic addition operations.
struct ngram_class_s ** classes
Word class definitions.
#define ckd_realloc(ptr, sz)
Macro for ckd_realloc
SPHINXBASE_EXPORT int32 ngram_model_add_class_word(ngram_model_t *model, const char *classname, const char *word, float32 weight)
Add a word to a class in a language model.
SPHINXBASE_EXPORT int32 ngram_prob(ngram_model_t *model, const char *const *words, int32 n)
Get the "raw" log-probability for a general N-Gram.
SPHINXBASE_EXPORT int32 glist_count(glist_t g)
Count the number of elements in a given linked list.
Determine file type automatically.
SPHINXBASE_EXPORT int32 ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Quick general N-Gram score lookup.
int32(* score)(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist, int32 *n_used)
Implementation-specific function for querying language model score.
File I/O related operations.
Locale-independent implementation of case swapping operation.
int32 * prob1
Probability table for base words.
int32 log_wip
Log of word insertion penalty.
int32(* add_ug)(ngram_model_t *model, int32 wid, int32 lweight)
Implementation-specific function for adding unigrams.
int(* apply_weights)(ngram_model_t *model, float32 lw, float32 wip)
Implementation-specific function for applying language model weights.
int32 n_words
Number of actual word strings (NOT the same as the number of unigrams, due to class words)...