// For every n-gram size listed in options.ngrams: split `text` into n-grams,
// count how often each one occurs, keep the ones accepted by usePhrase(), and
// append them to `results`.
_.each(options.ngrams, function (size) {
  var counter = new Tf();
  var customTokenizer = options.alternativeTokenizer;

  // `true` selects the built-in "runs of non-whitespace" pattern; any other
  // truthy value is treated as a RegExp whose source is recompiled with the
  // global flag only.
  if (customTokenizer === true) {
    natural.NGrams.setTokenizer(new natural.RegexpTokenizer({
      pattern: /\b[^\s]+\b/g,
      gaps: false
    }));
  } else if (customTokenizer) {
    natural.NGrams.setTokenizer(new natural.RegexpTokenizer({
      pattern: new RegExp(customTokenizer.source, "g"),
      gaps: false
    }));
  }

  // Turn each n-gram (an array of words) into a single lower-cased,
  // space-separated phrase, stemming the individual words first when asked to.
  var grams = natural.NGrams.ngrams(text, size);
  var phrasesForSize = _.map(grams, function (words) {
    var parts = options.stem ? _.map(words, stem) : words;
    return parts.join(' ').toLowerCase();
  });

  counter.addDocument(phrasesForSize);

  var ranked = counter.listMostFrequestTerms(0);
  var accepted = _.select(ranked, function (entry) {
    return usePhrase(entry.term, options);
  });

  results = results.concat(accepted);
});
/**
 * Split a string into trigram tokens.
 *
 * Each trigram produced by natural.NGrams.trigrams() is joined into a single
 * string with its parts separated by one space.
 *
 * @param {string} str - Text to tokenize.
 * @returns {string[]} One space-joined string per trigram, in the order
 *   returned by natural.NGrams.trigrams().
 */
Trigrams.prototype.tokenize = function (str) {
  var grams = natural.NGrams.trigrams(str);

  // join() places the separator only between parts, so no trailing space is
  // left behind. (The previous code appended "el + ' '" and then called
  // trim() without using its return value, which left every token with a
  // trailing space.)
  return grams.map(function (set) {
    return set.join(' ');
  });
};
// For each phrase, look at its sub-phrases that are exactly one word shorter.
// A sub-phrase that is also a key of `phrases` is removed from `combined`
// when no cutoff is set, or when the ratio of the longer phrase's value to
// the sub-phrase's value is at least (1 - cutoff).
_.each(_.keys(phrases), function (longer) {
  var shorterSize = longer.split(' ').length - 1;

  // Single-word phrases have no shorter sub-phrase to compare against.
  if (shorterSize < 1) {
    return;
  }

  var candidates = natural.NGrams.ngrams(longer, shorterSize);
  _.each(candidates, function (words) {
    var shorter = words.join(' ');

    // Ignore sub-phrases with no (truthy) entry in `phrases`.
    if (!phrases[shorter]) {
      return;
    }

    var ratio = phrases[longer] / phrases[shorter];
    var shouldDrop = !cutoff || ratio >= (1 - cutoff);
    if (shouldDrop) {
      delete combined[shorter];
    }
  });
});