Example #1
0
/**
 * Tokenize `text` and optionally merge adjacent tokens into compound tokens.
 *
 * @param {string} text - the raw text to parse
 * @param {Object} [options] - see defaults below; now optional (previously a
 *     missing options object crashed on the first property access)
 * @returns {string[]} the final token list
 */
documentModel.parseText = function(text, options) {
    // Copy defaults into a NEW object instead of mutating the caller's one.
    // The old `_.defaults(options, {...})` had two problems:
    //   1. it threw when `options` was undefined (the defaulted object that
    //      lodash returns was discarded, so `options` stayed undefined);
    //   2. it mutated the caller's object, including the ignoredList
    //      reassignment further down.
    options = _.defaults({}, options, {
         // this will set how tokens are merged together
         // with a value larger than 1, tokens will be merged
         // otherwise, the token list will be kept as is
        maxTokensToMerge: 1,
        
        // this will only return merged token (original tokens will be ignored)
        keepMergedOnly: false,
        
        // tokens that exists in the list will be removed 
        // from final result and not merged
        ignoredList: [
            'của', 'là', 'và', 'có', 'đã',
            'những', 'các', 'với',
            'cũng', 'đó', 'như', 'nhiều',
            'còn', 'mà', 'thế', 'đi', 'nhưng',
            'nhất', 'theo', 'sẽ',
            'đang', 'rất', 'hơn'
        ],
        
        // try to be smart or not?
        tryToBeSmart: 0
    });

    var tokenized = tokenizer.tokenize(text);
    var tokens = tokenized.tokens;
    // normalize so that indexOf-based merging in _mergeTokens matches
    text = tokenizer.normalize(text);

    // merge tokens to form new tokens
    if (options.maxTokensToMerge > 1) {
        var newTokens = documentModel._mergeTokens(
            text,
            // only send not ignored tokens
            _.difference(tokens, options.ignoredList),
            options.maxTokensToMerge
        );
        
        if (options.keepMergedOnly) {
            // ignore original tokens
            // without the special ones
            // (safe now: `options` is our own copy, the caller's
            //  ignoredList array is left untouched)
            options.ignoredList = options.ignoredList.concat(_.difference(tokens, tokenized.special));
        }
        
        tokens = tokens.concat(newTokens);
    }
    
    if (options.tryToBeSmart) {
        tokens = documentModel._removeIrrelevantTokens(text, tokens, 50, options.ignoredList);
        // tokens = documentModel._removeSingleAppearTokens(tokens);
        // tokens = documentModel._removeCompoundTokens(tokens);
    }

    // filter out ignored tokens
    if (options.ignoredList.length > 0) {
        tokens = _.difference(tokens, options.ignoredList);
    }
    
    return tokens;
};
Example #2
0
 _.each(tokens, function(token) {
     // Try to merge the current token with every suffix of the
     // sliding window of preceding tokens.
     if (previous.length > 0) {
         for (var start = 0; start < previous.length; start++) {
             var candidate = previous.slice(start).concat(token).join(' ');
             
             // Accept the candidate only when the joined phrase occurs
             // verbatim in the text (i.e. the original tokens were
             // separated by plain spaces, ASCII 32) and it does not mix
             // numeric and non-numeric parts.
             if (text.indexOf(candidate) !== -1 && !tokenizer.isMixedNumberAndNonNumber(candidate)) {
                 newTokens.push(candidate);
             }
         }
     }
     
     // Remember this token so later tokens can merge with it.
     previous.push(token);
     
     // Cap the window: drop the oldest token once the window is full,
     // so merged phrases never exceed `max` tokens.
     if (previous.length >= max) {
         previous.shift();
     }
 });