/**
 * Tokenizes `text` and returns its list of tokens, optionally merging
 * adjacent tokens into compound tokens and filtering out stop words.
 *
 * @param {String} text    the raw text to parse
 * @param {Object} [options]
 *   - maxTokensToMerge {Number}  merge up to this many adjacent tokens
 *                                into compound tokens (>1 enables merging)
 *   - keepMergedOnly   {Boolean} drop the original (non-special) tokens,
 *                                keeping only the merged compounds
 *   - ignoredList      {Array}   stop words removed from the result and
 *                                excluded from merging
 *   - tryToBeSmart     {Number}  when truthy, run the irrelevant-token
 *                                heuristic filter
 * @returns {Array} the resulting token list
 */
documentModel.parseText = function(text, options) {
    // Guard against a missing options object: _.defaults mutates its
    // first argument in place, so it must be a real object here,
    // otherwise the option reads below would throw a TypeError.
    options = options || {};
    _.defaults(options, {
        // this will set how tokens are merged together
        // with a value larger than 1, tokens will be merged
        // otherwise, the token list will be kept as is
        maxTokensToMerge: 1,
        // this will only return merged tokens (original tokens will be ignored)
        keepMergedOnly: false,
        // tokens that exist in this list will be removed
        // from the final result and not merged
        // (common Vietnamese stop words)
        ignoredList: [
            'của', 'là', 'và', 'có', 'đã', 'những', 'các', 'với',
            'cũng', 'đó', 'như', 'nhiều', 'còn', 'mà', 'thế', 'đi',
            'nhưng', 'nhất', 'theo', 'sẽ', 'đang', 'rất', 'hơn'
        ],
        // try to be smart or not?
        tryToBeSmart: 0
    });

    var tokenized = tokenizer.tokenize(text);
    var tokens = tokenized.tokens;
    // normalize so substring checks during merging match tokenized output
    text = tokenizer.normalize(text);

    // Work on a local copy of the ignored list so the caller's options
    // object is never mutated as a side effect of keepMergedOnly.
    var ignoredList = options.ignoredList;

    // merge tokens to form new compound tokens
    if (options.maxTokensToMerge > 1) {
        var newTokens = documentModel._mergeTokens(
            text,
            // only send non-ignored tokens to the merger
            _.difference(tokens, ignoredList),
            options.maxTokensToMerge
        );

        if (options.keepMergedOnly) {
            // schedule the original tokens for removal,
            // except the special ones
            ignoredList = ignoredList.concat(_.difference(tokens, tokenized.special));
        }

        tokens = tokens.concat(newTokens);
    }

    if (options.tryToBeSmart) {
        // heuristic pass: drop tokens deemed irrelevant to the text
        tokens = documentModel._removeIrrelevantTokens(text, tokens, 50, ignoredList);
    }

    // filter out ignored tokens
    if (ignoredList.length > 0) {
        tokens = _.difference(tokens, ignoredList);
    }

    return tokens;
};
_.each(tokens, function(token) {
    // Try merging this token with every suffix of the sliding window
    // of previously seen tokens. A merged candidate is kept only when
    // it occurs verbatim in the text — i.e. the original tokens are
    // separated by nothing but spaces (ASCII 32) — and it does not mix
    // numeric and non-numeric parts.
    for (var start = 0; start < previous.length; start++) {
        var candidate = previous.slice(start).concat([token]).join(' ');

        var existsInText = text.indexOf(candidate) !== -1;
        if (existsInText && !tokenizer.isMixedNumberAndNonNumber(candidate)) {
            newTokens.push(candidate);
        }
    }

    // Slide the window: remember this token for later merging and,
    // once the window holds `max` tokens, drop the oldest one.
    previous.push(token);
    if (previous.length >= max) {
        previous.shift();
    }
});