/**
 * Tokenizes `text` and returns its list of tokens, optionally merging
 * adjacent tokens into compound tokens and filtering out stop words.
 *
 * @param {String} text    the raw text to parse
 * @param {Object} [options]
 *   - maxTokensToMerge {Number}  merge up to this many adjacent tokens
 *                                into compound tokens (>1 enables merging)
 *   - keepMergedOnly   {Boolean} drop the original (non-special) tokens,
 *                                keeping only the merged compounds
 *   - ignoredList      {Array}   stop words removed from the result and
 *                                excluded from merging
 *   - tryToBeSmart     {Number}  when truthy, run the irrelevant-token
 *                                heuristic filter
 * @returns {Array} the resulting token list
 */
documentModel.parseText = function(text, options) {
    // Guard against a missing options object: _.defaults mutates its
    // first argument in place, so it must be a real object here,
    // otherwise the option reads below would throw a TypeError.
    options = options || {};
    _.defaults(options, {
        // this will set how tokens are merged together
        // with a value larger than 1, tokens will be merged
        // otherwise, the token list will be kept as is
        maxTokensToMerge: 1,
        // this will only return merged tokens (original tokens will be ignored)
        keepMergedOnly: false,
        // tokens that exist in this list will be removed
        // from the final result and not merged
        // (common Vietnamese stop words)
        ignoredList: [
            'của', 'là', 'và', 'có', 'đã', 'những', 'các', 'với',
            'cũng', 'đó', 'như', 'nhiều', 'còn', 'mà', 'thế', 'đi',
            'nhưng', 'nhất', 'theo', 'sẽ', 'đang', 'rất', 'hơn'
        ],
        // try to be smart or not?
        tryToBeSmart: 0
    });

    var tokenized = tokenizer.tokenize(text);
    var tokens = tokenized.tokens;
    // normalize so substring checks during merging match tokenized output
    text = tokenizer.normalize(text);

    // Work on a local copy of the ignored list so the caller's options
    // object is never mutated as a side effect of keepMergedOnly.
    var ignoredList = options.ignoredList;

    // merge tokens to form new compound tokens
    if (options.maxTokensToMerge > 1) {
        var newTokens = documentModel._mergeTokens(
            text,
            // only send non-ignored tokens to the merger
            _.difference(tokens, ignoredList),
            options.maxTokensToMerge
        );

        if (options.keepMergedOnly) {
            // schedule the original tokens for removal,
            // except the special ones
            ignoredList = ignoredList.concat(_.difference(tokens, tokenized.special));
        }

        tokens = tokens.concat(newTokens);
    }

    if (options.tryToBeSmart) {
        // heuristic pass: drop tokens deemed irrelevant to the text
        tokens = documentModel._removeIrrelevantTokens(text, tokens, 50, ignoredList);
    }

    // filter out ignored tokens
    if (ignoredList.length > 0) {
        tokens = _.difference(tokens, ignoredList);
    }

    return tokens;
};
_.each(tokens, function(token) {
    // Try merging this token with every suffix of the sliding window
    // of previously seen tokens. A merged candidate is kept only when
    // it occurs verbatim in the text — i.e. the original tokens are
    // separated by nothing but spaces (ASCII 32) — and it does not mix
    // numeric and non-numeric parts.
    for (var start = 0; start < previous.length; start++) {
        var candidate = previous.slice(start).concat([token]).join(' ');

        var existsInText = text.indexOf(candidate) !== -1;
        if (existsInText && !tokenizer.isMixedNumberAndNonNumber(candidate)) {
            newTokens.push(candidate);
        }
    }

    // Slide the window: remember this token for later merging and,
    // once the window holds `max` tokens, drop the oldest one.
    previous.push(token);
    if (previous.length >= max) {
        previous.shift();
    }
});