function processText(text) {
  var tokenizer = new natural.WordTokenizer();
  return tokenizer.tokenize(text);
}
function similarity(s1, s2) {
  var w1s = tokenizer.tokenize(s1);
  var w2s = tokenizer.tokenize(s2);
  var commonCount = _.intersection(w1s, w2s).length;
  if (commonCount === 0) return 0;
  // Normalize the overlap by the log-scaled lengths of both token lists.
  var res = commonCount / (Math.log10(w1s.length) + Math.log10(w2s.length));
  if (isNaN(res)) {
    throw new Error('similarity is NaN');
  }
  return res;
}
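For intuition, a hedged sketch of the score this overlap metric produces; the example sentences are hypothetical:

// Four tokens each, two in common ('the', 'brown'):
// similarity('the quick brown fox', 'the lazy brown dog')
// => 2 / (Math.log10(4) + Math.log10(4)) ≈ 1.66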
module.exports = function summarize(html) {
  var data = unfluff(html);
  var tokens = tokenizer.tokenize(data.text).length;
  var wordcounts = glossary.extract(data.title + '. ' + data.text);
  var keywords = _.chain(wordcounts)
    .filter(function(wc) { return wc.norm.length > 2; })
    .sortBy('count')
    .reverse()
    .pluck('norm')
    .value();
  if (tokens > 5 && keywords.length > 2) {
    var sent = sentiment(data.text);
    var stats = new Stats(data.text);
    // SMOG index runs roughly 0-12; scale to 0-1 and slow the reading speed accordingly.
    var difficulty = stats.smogIndex() / 12;
    var wpm = (200 - 100 * difficulty) || 1;
    var minutes = Math.ceil(tokens / wpm);
    return _.extend({}, DEFAULTS, {
      ok: true,
      sentiment: sent.comparative,
      title: data.title,
      topics: keywords,
      words: tokens,
      difficulty: difficulty,
      minutes: minutes,
      image: data.image
    });
  }
  return _.extend({}, DEFAULTS);
};
.on('line', (tweet) => {
  onData(tweet);
  let prevLength = tweets.length,
      tempTweet = JSON.parse(tweet);
  // _.union skips elements that are already present, so the length
  // only grows for tweets we haven't processed yet.
  tweets = _.union(tweets, [tempTweet.d]);
  let newLength = tweets.length;
  if (prevLength < newLength) {
    tempTweet.d = _.replace(tempTweet.d, new RegExp(regexes.join('|'), 'gi'), '');
    let tempTokens = tokenizer.tokenize(emojiStrip(utf8.encode(tempTweet.d.toLowerCase()))),
        tempIntersection = _.intersection(tempTokens, slanKeys);
    // Expand any slang tokens to their full forms before writing out.
    if (tempIntersection.length > 0) {
      tempTweet.d = tempTokens.join(' ');
      for (let i = tempIntersection.length - 1; i >= 0; i--) {
        tempTweet.d = tempTweet.d.replace(tempIntersection[i], slan[tempIntersection[i]]);
      }
      tempTokens = tempTweet.d.split(' ');
    }
    tokenWriteStream.write(JSON.stringify({ tok: _.difference(tempTokens, stopWords), t: tempTweet.t }) + '\n');
  }
})
var getTags = function(msg, cb) {
  var tags = [];
  var tokens = tokenizer.tokenize(msg);
  wpos.getPOS(tokens, function(results) {
    results = _.omit(results, 'rest');
    _.each(results, function(result) {
      if (result.length > 0) {
        // Count occurrences of each word within this POS bucket.
        var words = {};
        for (var i = result.length - 1; i >= 0; i--) {
          if (!words.hasOwnProperty(result[i])) {
            words[result[i]] = { tag: result[i], count: 1 };
          } else {
            words[result[i]].count += 1;
          }
        }
        // Keep the most frequent word from each bucket as a tag.
        var m = _.max(words, function(word) { return word.count; }).tag;
        if (!_.contains(tags, m)) {
          tags.push(m);
        }
      }
    });
    return cb(tags); // array of tokenized tags
  });
};
var interpreter = function(data) {
  // If the message parses as a number, treat it as the user's zip code.
  if (parseInt(data.content)) {
    data.user.zipCode = parseInt(data.content);
  }
  // Split the message into tokens.
  var sentence = tokenizer.tokenize(data.content);
  // Add part-of-speech tags (noun, verb, etc.).
  var taggedSentence = tagger.tag(sentence);
  // Pick out the nouns of interest (things to be donated/disposed).
  var itemsToDispose = grabThings(taggedSentence);
  if (data.user.zipCode) {
    var response = locationToResponse(locations[0], itemsToDispose.join(','));
    data.user.itemsToDispose = null;
    return response;
  } else {
    return 'what is your zipcode?';
  }
};
function getLocationConfidence(text, searchLocation) {
  var matchingCities = _.map(locations, function (subLocations) {
    return _.map(subLocations, function (cities) {
      var normalizedCities = cities.map(function (city) {
        return city.toLowerCase();
      });
      return _.includes(normalizedCities, searchLocation.toLowerCase()) ? normalizedCities : null;
    });
  });
  var cityList = _.compact(_.flattenDeep(matchingCities));
  var allPhrases = createTokens(text);
  var matchingPhrase = _.intersection(allPhrases, cityList);
  // No phrase in the text matches a known city: zero confidence.
  if (matchingPhrase.length === 0) return 0;
  var textTokenized = tokenizer.tokenize(text);
  var locationTokenized = tokenizer.tokenize(matchingPhrase[0]);
  // Confidence is the share of the text taken up by the matched location.
  return locationTokenized.length / textTokenized.length;
}
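For intuition, a hedged sketch of the returned ratio, assuming the external locations data contains 'New York':

// getLocationConfidence('I live in New York', 'New York')
// matched phrase 'new york' has 2 tokens; the text has 5
// => 2 / 5 = 0.4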
// Prepares keywords for use in findDataMap (where the grouping categories are important).
prepareAllKeywordsMap(stopKW) {
  let startMap = require('../resources/' + this.locale + '/strings/commands-keywords');
  let resultMap = {};
  for (let group in startMap) {
    let groupMap = {};
    let startMapGroup = startMap[group];
    for (let cat in startMapGroup) {
      let catList = [];
      let startCatList = startMapGroup[cat];
      for (let k in startCatList) {
        let cleaned = this.prepareClean(startCatList[k]).toLowerCase();
        let tokenized = tokenizeOnly ? tokenizer.tokenize(cleaned) : cleaned.tokenizeAndStem(true);
        catList.push({ key: tokenized.join(' '), span: tokenized.length });
        stopKW.push(tokenized.join(' '));
      }
      groupMap[cat] = catList;
    }
    resultMap[group] = groupMap;
  }
  return resultMap;
}
// Returns ALL matches in 'phrase' to keywords listed in 'opts'.
// The result maps each known keyword to a list of matches, as in:
// subject: {
//   'change diaper': [ 'change diaper/change diaper(0/3)' ],
//   diapers: [ 'diapers/diaper(1/2)' ],
//   poop: [ 'poop/poop(0/1)', 'poop/poop(0/1)', 'poop/poop(0/1)' ]
// }
findManyDataMap(phrase, opts, threshold) {
  let cleanPhrase = this.prepareClean(phrase);
  let tokens = tokenizeOnly ? tokenizer.tokenize(cleanPhrase) : cleanPhrase.tokenizeAndStem(true);
  let found = {};
  for (let i in tokens) {
    for (let o in opts) {
      let opt = opts[o];
      // Longer keywords tolerate proportionally more edit distance; short ones must match exactly.
      let thres = (opt.key.length > 4) ? Math.max(Math.floor(opt.key.length * threshold), 1) : 0;
      let sourceStr = tokens.slice(parseInt(i), parseInt(i) + opt.span).join(' ');
      let targetStr = opt.key;
      let lev = new Levenshtein(sourceStr, targetStr);
      if (debug) {
        console.log(tokens + ', opt: ' + targetStr + ', src: ' + sourceStr + ', thres: ' + thres + ', dist: ' + lev.distance);
      }
      if (lev.distance <= thres) {
        if (!found[targetStr]) { found[targetStr] = []; }
        found[targetStr].push(targetStr + '/' + sourceStr + '(' + lev.distance + '/' + thres + ')');
      }
    }
  }
  return (Object.keys(found).length > 0) ? found : null;
}
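A minimal usage sketch for findManyDataMap, assuming tokenize-only mode and that prepareClean leaves the phrase as-is; the phrase and opts entry are hypothetical:

// With threshold 0.25 and the 13-character key 'change diaper',
// thres = Math.max(Math.floor(13 * 0.25), 1) = 3, so 'change diapers' (distance 1) matches:
// findManyDataMap('please change diapers', [{ key: 'change diaper', span: 2 }], 0.25)
// => { 'change diaper': [ 'change diaper/change diapers(1/3)' ] }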
o.forEach(function(x) {
  var tags = ($N.objTags(x) || []).map(function(t) { return '#' + t; });
  var desc = $N.objDescription(x);
  if (desc !== undefined) {
    // Strip HTML from the description, keeping only its text.
    var $ = cheerio.load(desc);
    desc = $.text();
  } else {
    desc = '';
  }
  var text = (x.name || '') + ' ' + desc;
  var terms = tokenizer.tokenize(text.trim()).map(function(w) { return w.toLowerCase(); });
  terms = _.difference(terms, sw); // drop stopwords
  var tokens = _.union(tags, terms);
  if (tokens.length > 0) lda.addDocument(x.id, tokens);
});
reply.forEach(function(tweet) {
  var t = tweet.text;
  var no_at = t.replace(/@\w+/g, ''); // remove '@username' mentions
  var no_space = no_at.replace(/^\s+|\s+$/g, '').toLowerCase(); // trim leading/trailing whitespace
  var tokenize_string = tokenizer.tokenize(no_space);
  wstream.write(tokenize_string.toString() + '\n');
});
_.each(wordList, function(sentence) {
  var words = tokenizer.tokenize(sentence);
  _.each(words, function(word) {
    // Skip very short tokens.
    if (word.length > 2) {
      self.addWord(word);
    }
  });
});
function prepText(text) {
  if (_.isArray(text)) return text;
  var deduped = _.uniq(tokenizer.tokenize(text));
  if (!this.options.stopwords) return deduped;
  // Use the caller-supplied stopword list when given, else the default one.
  return _.reject(deduped, _.bind(isStopword, null,
    _.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
  ));
}
function createTokens(text) {
  var tokenized = tokenizer.tokenize(text);
  var biGrams = NGrams.ngrams(tokenized, 2).map(function (words) { return words.join(' '); });
  var triGrams = NGrams.ngrams(tokenized, 3).map(function (words) { return words.join(' '); });
  // Combine unigrams, bigrams and trigrams into one lowercase phrase list.
  var ngrams = _.flatten(tokenized.concat(biGrams, triGrams));
  return ngrams.map(function (phrase) { return phrase.toLowerCase(); });
}
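For reference, what this yields on a short input (using natural's WordTokenizer and NGrams, as above):

// createTokens('New York City')
// => ['new', 'york', 'city', 'new york', 'york city', 'new york city']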
function get_other_account(query) {
  var tokenizer = new natural.WordTokenizer();
  var tokens = tokenizer.tokenize(query);
  for (var i = 0; i < tokens.length; i++) {
    // Look for the pattern "payments from <account>".
    if (tokens[i] === 'from' &&
        i >= 1 && (tokens[i - 1] === 'payments' || tokens[i - 1] === 'Payments') &&
        i + 1 < tokens.length) {
      return tokens[i + 1];
    }
  }
  return '';
}
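A quick hypothetical call matching the "payments from <account>" pattern above:

// get_other_account('show payments from savings') => 'savings'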
function matchRE(re, text) {
  var wordArray = tokenizer.tokenize(text);
  for (var i = 0; i < wordArray.length; i++) {
    if (re.test(wordArray[i])) {
      return true;
    }
  }
  return false;
}
function strToDoc(str) {
  // Lowercase and replace German umlauts before tokenizing and stemming.
  var s = ('' + str).toLowerCase();
  s = s.replace(/ä/g, 'a');
  s = s.replace(/ö/g, 'o');
  s = s.replace(/ü/g, 'u');
  var tokens = tokenizer.tokenize(s);
  return stemmer.stemm(tokens);
}
describe('Brill\'s POS Tagger', function() {
  var brill_pos_tagger;
  it('should initialise correctly with tagging rules for English', function(done) {
    brill_pos_tagger = new Brill_POS_Tagger(en_lexicon_file, en_rules_file, 'NN', function(error) {
      done();
    });
  });

  var sentences;
  it('should correctly read a NYT article about Picasso', function(done) {
    fs.readFile(en_ex_nyt_article, 'utf8', function(error, text) {
      sentences = text.split('\n');
      done();
    });
  });

  var posjs_results;
  it('should correctly read tag results of pos-js for the NYT article', function(done) {
    fs.readFile(en_ex_nyt_article_expected_tag_results, 'utf8', function(error, text) {
      posjs_results = JSON.parse(text);
      done();
    });
  });

  var tokenizer = new natural.WordTokenizer();
  it('should process the article just like the old pos module', function() {
    sentences.forEach(function(sentence, index) {
      var tokenized_sentence = tokenizer.tokenize(sentence);
      var taggedWords = brill_pos_tagger.tag(tokenized_sentence);
      expect(taggedWords).toEqual(posjs_results[index]);
    });
  });

  it('should initialise correctly with tagging rules for Dutch', function(done) {
    brill_pos_tagger = new Brill_POS_Tagger(du_lexicon_file, du_rules_file, 'N', function(error) {
      done();
    });
  });

  it('should correctly read a Volkskrant article about the ECB', function(done) {
    fs.readFile(du_ex_volkskrant_article, 'utf8', function(error, text) {
      sentences = text.split('\n');
      done();
    });
  });

  it('should process the Volkskrant article', function() {
    sentences.forEach(function(sentence) {
      var tokenized_sentence = tokenizer.tokenize(sentence);
      var taggedWords = brill_pos_tagger.tag(tokenized_sentence);
      expect(tokenized_sentence.length).toEqual(taggedWords.length);
    });
  });
});
app.get("/api/1/messages/natural", function (req, res) { console.log("natural language processing with"+req.query.q); var natural = require('natural'); var sentiment=require('sentiment'); var nltk={}; tokenizer = new natural.WordTokenizer(); nltk.token=tokenizer.tokenize(req.query.q); natural.LancasterStemmer.attach(); nltk.stem=req.query.q.tokenizeAndStem(); var resp=sentiment(req.query.q); var sentimentValue="NEUTRAL"; if(resp.score > 0){ sentimentValue="POSITIVE"; }else if(resp.score < 0){ sentimentValue="NEGATIVE"; } nltk.sentiment=sentimentValue; console.log(nltk) res.send(nltk); });
var filteredResponses = _.filter(responses, function(response) {
  var tokens = tokenizer.tokenize(response.message);
  // Keep responses whose origin and destination match, excluding 'looking' posts.
  return isMatch(origin, destination, tokens) && !Util.stringMatch(tokens, 'looking');
});
this.addToDataset = function (text, next) {
  let wordArr = tokenizer.tokenize(text);
  tagWords(wordArr, (err, resp) => {
    if (err) { return next(err); }
    let POSArr = JSON.parse(resp);
    updateDataset(POSArr, next);
  });
};
/**
 * Runs all the plugins that power Chevy's decision-making process
 * in a promise chain.
 * @param  {Object} context conversation context
 * @return {Promise} resolves once conversation, search and action have run
 */
think(context) {
  // Tokenize the query before handing it to the plugins.
  context.queryTokens = this.tokenizer.tokenize(context.query)
  return conversation(context)
    .then(function(context) { return search(context) })
    .then(function(context) { return action(context) })
}
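A minimal usage sketch, assuming an instance of the surrounding class (here called chevy); the query is hypothetical:

// chevy.think({ query: 'find charging stations nearby' })
//   .then(function(context) {
//     // context.queryTokens => ['find', 'charging', 'stations', 'nearby']
//   })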
request("https://hacker-news.firebaseio.com/v0/item/" + articleId + ".json?print=pretty", function (error, response, body) { var article = JSON.parse(body), articleTitle = article.title || "", articleText = (article.text || "").toLowerCase(); addKeywords(tokenizer.tokenize(articleTitle + articleText)); saveLastId(articleId); console.log('Read article #' + article.id + ". Type: " + article.type); fetchRecursive(articleId + 1, remaining - 1); });
fs.readFile(fileName, 'utf8', function(err, data) {
  if (err) {
    return console.log('There was an error: ' + err);
  }
  // Count lines (the split leaves one extra empty entry after the trailing newline).
  var fileLinesInArray = data.split('\n');
  var numberOfLines = fileLinesInArray.length - 1;
  console.log(numberOfLines + ' lines.');
  // Count characters.
  var numberOfCharacters = data.split('').length;
  console.log(numberOfCharacters + ' characters.');
  // Count words using the natural package's tokenizer.
  var tokenizer = new natural.WordTokenizer();
  var numberOfWords = tokenizer.tokenize(data).length;
  console.log(numberOfWords + ' words.');
});
Index.prototype.add = function (filename, document, callback) {
  var self = this;
  var PUNCTUATION = ['.', ',', ':', ''];
  var tokenizer = new natural.WordTokenizer();
  var tokens = tokenizer.tokenize(document);
  // TODO: Remove stop words
  // Index each non-punctuation token: token -> set of filenames containing it.
  var tasks = tokens.filter(function (token) {
    return PUNCTUATION.indexOf(token) === -1;
  }).map(function (token) {
    return function (cb) {
      self.tokenClient.sadd(token, filename, cb);
    };
  });
  // Also store the full document under its filename.
  tasks.push(function (cb) {
    self.tokenClient.set(filename, document, cb);
  });
  async.parallel(tasks, callback);
};
this.classifyNN = function(net, msg) {
  if (!net) throw _argumentError(0, 'net');
  if (!msg) throw _argumentError(1, 'msg');
  // Build a binary bag-of-words input vector for the network.
  var input = {};
  tokenizer.tokenize(msg).forEach(function(token) {
    input[token] = 1;
  });
  return net.run(input);
};
tagSchema.statics.tokenize = function (tagString) {
  var results = [];
  var words = tokenizer.tokenize(tagString);
  for (var w in words) {
    var word = words[w].toLowerCase();
    // Skip words on the 'useless' stoplist; stem the rest.
    if (useless[word] === undefined) {
      results.push(natural.PorterStemmer.stem(word));
    }
  }
  return results;
};
var organizeWords = function(text) {
  // text is a string; returns an object mapping stems to counts.
  var words = {};
  var tokenizer = new natural.WordTokenizer();
  natural.PorterStemmer.attach();
  var imptWords = text.toLowerCase().tokenizeAndStem();
  var textArr = tokenizer.tokenize(text);
  // Seed the map with stems of real English words, each with a count of 0.
  for (var j = 0; j < imptWords.length; j++) {
    if (eng_words.indexOf(imptWords[j]) > -1) words[imptWords[j]] = 0;
  }
  // Count how often each seeded stem occurs in the full text.
  for (var i = 0; i < textArr.length; i++) {
    var root = natural.PorterStemmer.stem(textArr[i]);
    if (Object.keys(words).indexOf(root) > -1) words[root] += 1;
  }
  return words;
};
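A quick sketch of the expected output, assuming the external eng_words list contains the stem 'run':

// organizeWords('running runs run')
// => { run: 3 }  (all three tokens stem to 'run')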
// Prepares the keyword list for use in findManyDataMap (where grouping categories are NOT important).
prepareArticlekeywords(articles) {
  let resultList = [];
  for (let k in articles) {
    let cleaned = this.prepareClean(articles[k]).toLowerCase();
    let tokenized = tokenizeOnly ? tokenizer.tokenize(cleaned) : cleaned.tokenizeAndStem(true);
    resultList.push({ key: tokenized.join(' '), span: tokenized.length });
  }
  return resultList;
}
function grabTopics(text) {
  var datesCompacted = classifier_1.runThroughClassifiers(text, dateClassifiers);
  var datesGrouped = _.groupBy(datesCompacted, 'topic');
  // Numeric tokens are treated as "special" words (e.g. dates, quantities).
  var specials = _.compact(tokenizer.tokenize(text).filter(function (token) {
    return !isNaN(parseInt(token, 10));
  }));
  var intent = {
    action: null,
    details: {
      dates: _.mapValues(datesGrouped, function (classifications) {
        return classifications.map(function (classification) {
          return _.startCase(classification.label);
        });
      }),
      specialWords: specials,
      locations: locatonExtractor(text),
    },
    topic: 'details',
  };
  return Promise.resolve(intent);
}