Example #1
function processText(text) {
    // Split the input into word tokens using natural's WordTokenizer.
    var tokenizer = new natural.WordTokenizer();
    var tokens = tokenizer.tokenize(text);

    return tokens;
}
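For illustration, a minimal hedged usage sketch; the only assumption is that the `natural` package is installed and required, as the snippet above implies.

// Hypothetical usage of processText; `natural` is assumed to be available at module scope.
var natural = require('natural');

var tokens = processText('The quick brown fox jumps over the lazy dog.');
console.log(tokens);
// -> [ 'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog' ]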
Example #2
function similarity(s1, s2) {
	var w1s = tokenizer.tokenize(s1);
	var w2s = tokenizer.tokenize(s2);
	var is = _.intersection(w1s, w2s);
	var commonCount = is.length;
	if (commonCount === 0) return 0;
	// Scale the word overlap by the log10 of each sentence's token count.
	var res = commonCount / (Math.log10(w1s.length) + Math.log10(w2s.length));
	if (isNaN(res)) {
		throw new Error('NaN');
	}
	return res;
}
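A rough sketch of how similarity might be exercised. The `tokenizer` and `_` bindings are assumptions that mirror the module-level dependencies the snippet relies on (natural's WordTokenizer and lodash/underscore).

// Hypothetical setup; the original module is assumed to define these itself.
var natural = require('natural');
var _ = require('lodash');
var tokenizer = new natural.WordTokenizer();

var score = similarity('deep learning for text', 'text mining with deep learning');
console.log(score); // shared-word count divided by the sum of log10 token counts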
Example #3
module.exports = function summarize(html) {
  var data = unfluff(html);
  var tokens = tokenizer.tokenize(data.text).length;
  var wordcounts = glossary.extract(data.title + '. ' + data.text);
  var keywords = _.chain(wordcounts)
    .filter(function(wc) { return wc.norm.length > 2 })
    .sortBy('count')
    .reverse()
    .pluck('norm')
    .value();

  if (tokens > 5 && keywords.length > 2) {
    var sent = sentiment(data.text);
    var stats = new Stats(data.text);
    var difficulty = stats.smogIndex() / 12;
    var wpm = (200 - 100 * difficulty) || 1;
    var minutes = Math.ceil(tokens / wpm);

    return _.extend({}, DEFAULTS, {
      ok: true,
      sentiment: sent.comparative,
      title: data.title,
      topics: keywords,
      words: tokens,
      difficulty: difficulty,
      minutes: minutes,
      image: data.image
    });
  }

  return _.extend({}, DEFAULTS);
};
Example #4
        .on('line', (tweet) => {
            onData(tweet);
            let prevLength = tweets.length,
                tempTweet = JSON.parse(tweet);
            tweets = _.union(tweets, [tempTweet.d]); // _.union skips values that have already been added
            let newLength = tweets.length;

            if(prevLength < newLength) { // this tweet is new, so start processing it
                tempTweet.d = _.replace(tempTweet.d, new RegExp(regexes.join('|'), "gi"),'');
                let tempTokens = tokenizer.tokenize(
                        emojiStrip(
                            utf8.encode(tempTweet.d.toLowerCase())
                        )
                    ),
                    tempIntersection = _.intersection(tempTokens, slanKeys);
                if(tempIntersection.length > 0) {
                    tempTweet.d = tempTokens.join(" ");
                    for (let i = tempIntersection.length - 1; i >= 0; i--) {
                        tempTweet.d = tempTweet.d.replace(
                            tempIntersection[i],
                            slan[tempIntersection[i]]
                        )
                    }
                    tempTokens = tempTweet.d.split(" ");
                }
                tokenWriteStream.write(JSON.stringify({tok: _.difference(tempTokens, stopWords), t: tempTweet.t}) + "\n");
            }
        })
Example #5
	var getTags = function(msg, cb){
		var tags = [];
		var a = tokenizer.tokenize(msg);

		wpos.getPOS(a, function(results){
			results = _.omit(results, 'rest');
			_.each(results, function(result){
				if(result.length > 0){
					var words={};
					for (var i = result.length - 1; i >= 0; i--) {
						if(!words.hasOwnProperty(result[i])){
							words[result[i]] = {
								tag: result[i],
								count: 1
							}
						} else {
							words[result[i]].count += 1;
						}
						
					}
					var m = _.max(words, function(noun){
						return noun.count;
					}).tag;

					if(!_.contains(tags, m)){
						tags.push(m);
						console.log("push!");
					}
				}
			});
			console.log(tags+" The tags!");
			return cb(tags); //array of tokenized tags
		});
		
	}
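For context, a hedged usage sketch. Neither `wpos` nor `tokenizer` is shown above, so the requires below are assumptions: `wpos` is taken to be a `wordpos` instance, `tokenizer` a natural WordTokenizer, and `_` an underscore/lodash build that provides `_.contains`.

// Hypothetical wiring for getTags; all of these requires are assumptions.
var natural = require('natural');
var WordPOS = require('wordpos');
var _ = require('underscore');

var tokenizer = new natural.WordTokenizer();
var wpos = new WordPOS();

getTags('The quick brown fox jumps over the lazy dog', function (tags) {
  console.log(tags); // most frequent word per part of speech, deduplicated
});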
Example #6
var interpreter = function(data) {
  // if it is a zip, assign to user
  if(parseInt(data.content)){
    data.user.zipCode = parseInt(data.content)
  }

  // split into an array of word tokens
  var sentence = tokenizer.tokenize(data.content)
  // add language tags (noun, verb, etc.)
  var taggedSentence = tagger.tag(sentence)
  // Sort out nouns of interests (things to be donated/disposed)
  var itemsToDispose = grabThings(taggedSentence)

  if (data.user.zipCode){
    var response = locationToResponse(locations[0], itemsToDispose.join(','))

    // var response = "Here is a link where you can find information about where to get rid of your " + itemsToDispose.join(',') + " : http://www.recycleworks.org/";
    data.user.itemsToDispose = null
    return response
  } else {
    return 'what is your zipcode?'
  }
};
Example #7
function getLocationConfidence(text, searchLocation) {
    var matchingCities = _.map(locations, function (subLocations, key) {
        // console.log(subLocations);
        return _.map(subLocations, function (cities) {
            // console.log(cities, searchLocation);
            var normalizedCities = cities.map(function (city) { return city.toLowerCase(); });
            return _.includes(normalizedCities, searchLocation.toLowerCase()) ? normalizedCities : null;
        });
    });
    var cityList = _.compact(_.flattenDeep(matchingCities));
    var allPhrases = createTokens(text);
    var matchingPhrase = _.intersection(allPhrases, cityList);
    if (matchingPhrase.length === 0) return 0;
    var textTokenized = tokenizer.tokenize(text);
    var locationTokenized = tokenizer.tokenize(matchingPhrase[0]);
    return locationTokenized.length / textTokenized.length;
}
Example #8
  // prepares keywords to use in findDataMap (where the grouping categories are important)
  prepareAllKeywordsMap(stopKW) {
    let startMap = require('../resources/' + this.locale + '/strings/commands-keywords');

    let resultMap = {};
    for (let group in startMap) {
      let groupMap = {};
      let startMapGroup = startMap[group];
      for (let cat in startMapGroup) {
        let catList = [];
        let startCatList = startMapGroup[cat];
        for (let k in startCatList) {
          let cleaned = this.prepareClean(startCatList[k]).toLowerCase();
          let tokenized = tokenizeOnly ? tokenizer.tokenize(cleaned) : cleaned.tokenizeAndStem(true);
          catList.push({
            key: tokenized.join(' '),
            span: tokenized.length
          });
          stopKW.push(tokenized.join(' '));
        }
        groupMap[cat] = catList;
      }
      resultMap[group] = groupMap;
    }

    return resultMap;
  }
Example #9
  // returns ALL matches in 'phrase' to keywords listed in 'opts'
  // returns a map where keys are the known keyword, and values are a list of matches, as in:
  //  subject: {
  //    'change diaper': [ 'change diaper/change diaper(0/3)' ],
  //    diapers: [ 'diapers/diaper(1/2)' ],
  //    poop: [ 'poop/poop(0/1)', 'poop/poop(0/1)', 'poop/poop(0/1)' ]
  //  }
  findManyDataMap(phrase, opts, threshold) {

    let cleanPhrase = this.prepareClean(phrase);

    let tokens = tokenizeOnly ? tokenizer.tokenize(cleanPhrase) : cleanPhrase.tokenizeAndStem(true);

    let found = {};

    for (let i in tokens) {
      for (let o in opts) {

        let opt = opts[o];
        let thres = (opt.key.length > 4) ? Math.max(Math.floor(opt.key.length * threshold),1) : 0;

        let sourceStr = tokens.slice(parseInt(i), parseInt(i) + opt.span).join(' ');

        let targetStr = opt.key;
        let lev = new Levenshtein(sourceStr, targetStr);
        if (debug) {
          console.log(tokens + ', opt: ' + targetStr + ', src: ' + sourceStr + ', thres: ' + thres + ', dist: ' + lev.distance);
        }

        if (lev.distance <= thres) {
          if (!found[targetStr]) {
            found[targetStr] = [];
          }

          let prevMatches = found[targetStr];
          prevMatches.push(targetStr + '/' + sourceStr + '(' + lev.distance + '/' + thres + ')');
        }
      }
    }

    return (Object.keys(found).length > 0) ? found : null;
  }
Example #10
                        o.forEach(function(x) {
                            var tags = ($N.objTags(x)||[]).map(function(t) {
                                return '#' + t;
                            });

                            var desc = $N.objDescription(x);
                            if (desc !== undefined) {
                                var $ = cheerio.load(desc);
                                desc = $.text();
                            } else {
                                desc = '';
                            }

                            var text = (x.name||'') + ' ' + desc;

                            var terms = tokenizer.tokenize(text.trim()).map(function(w) {
                                return w.toLowerCase();
                            });
                            terms = _.difference(terms, sw);

                            var tokens = _.union(tags,terms);

                            if (tokens.length > 0)
                                lda.addDocument(x.id, tokens);
                        });
Example #11
			reply.forEach(function(tweet){
				var t = tweet.text;
				var no_at = t.replace(/@\w+/g, '');		// remove '@username' mentions
				var no_space = no_at.replace(/^\s+|\s+$/g, '').toLowerCase();	// trim leading/trailing whitespace and lower-case
				var tokenize_string = tokenizer.tokenize(no_space);
				wstream.write(tokenize_string.toString() + "\n");
			});
Example #12
	_.each(wordList, function(sentence) {
		var words = tokenizer.tokenize(sentence);
		_.each(words, function(word) {
			if(word.length > 2) {
				self.addWord(word);
			}
		});
	});
Example #13
function prepText(text) {
  if (_.isArray(text)) return text;
  var deduped = _.uniq(tokenizer.tokenize(text));
  if (!this.options.stopwords) return deduped;
  return _.reject(deduped, _.bind(isStopword, null,
      _.isString(this.options.stopwords) ? this.options.stopwords : natural_stopwords
      ));
}
Example #14
function createTokens(text) {
    var tokenized = tokenizer.tokenize(text);
    var biGrams = NGrams.ngrams(tokenized, 2).map(function (words) { return words.join(' '); });
    var triGrams = NGrams.ngrams(tokenized, 3).map(function (words) { return words.join(' '); });
    var ngrams = _.flatten(tokenized.concat(biGrams, triGrams));
    var allPhrases = ngrams.map(function (phrase) { return phrase.toLowerCase(); });
    return allPhrases;
}
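The helper above leans on natural's NGrams alongside the tokenizer; here is a hedged sketch of that assumed setup plus a sample call.

// Assumed dependencies for createTokens; the names mirror the snippet above.
var natural = require('natural');
var _ = require('lodash');
var tokenizer = new natural.WordTokenizer();
var NGrams = natural.NGrams;

console.log(createTokens('San Francisco Bay Area'));
// -> unigrams, then bigrams, then trigrams, all lower-cased:
// [ 'san', 'francisco', 'bay', 'area',
//   'san francisco', 'francisco bay', 'bay area',
//   'san francisco bay', 'francisco bay area' ]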
Example #15
function get_other_account(query){
	var tokenizer = new natural.WordTokenizer();
	var tokens = tokenizer.tokenize(query);
	var other_account = '';
	for (var i = 0; i < tokens.length; i++){
		var token = tokens[i];
		if (token == "from"){
			// only match when the preceding token is "payments"/"Payments"
			if (i >= 1 && (tokens[i-1] == "payments" || tokens[i-1] == "Payments")){
				if (i+1 < tokens.length){
					other_account = tokens[i+1];
					return other_account;
				}
			}
		}
	}
}
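A quick hedged call for reference; the tokenizer is constructed inside the function, so only `natural` needs to be in scope.

// Hypothetical usage of get_other_account.
var natural = require('natural');

get_other_account('show payments from savings');  // -> 'savings'
get_other_account('show my balance');             // -> undefined (no "payments from" phrase)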
Example #16
function matchRE(re, text) {
  var wordArray = tokenizer.tokenize(text);
  for (var i = 0; i < wordArray.length; i++) {
    if (re.test(wordArray[i])) {
      return true;
    }
  }
  return false;
}
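A small hedged sketch of how matchRE might be called; `tokenizer` is again assumed to be natural's WordTokenizer.

// Hypothetical usage; the regex is tested against each token individually.
var natural = require('natural');
var tokenizer = new natural.WordTokenizer();

matchRE(/^colou?r$/i, 'What colour is it?');  // true
matchRE(/^\d{5}$/, 'no zip code here');       // false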
Example #17
function strToDoc(str){
    var arr = ('' + str).toLowerCase();
    arr = arr.replace(/ä/g, 'a');
    arr = arr.replace(/ö/g, 'o');
    arr = arr.replace(/ü/g, 'u');
    arr = tokenizer.tokenize(arr);
    arr = stemmer.stemm(arr);
    return arr;
}
Example #18
describe('Brill\'s POS Tagger', function() {
  var brill_pos_tagger;
  it('should initialise correctly with tagging rules for English', function(done) {
    brill_pos_tagger = new Brill_POS_Tagger(en_lexicon_file, en_rules_file, 'NN', function(error) {
      done();
    });
  });

  var sentences;
  it('should correctly read a NYT article about Picasso', function(done) {
    fs.readFile(en_ex_nyt_article, 'utf8', function (error, text) {
      sentences = text.split('\n');
      done();
    });
  });
  
  var posjs_results;
  it('should correctly read tag results of pos-js for the NYT article', function(done) {
    fs.readFile(en_ex_nyt_article_expected_tag_results, 'utf8', function (error, text) {
      posjs_results = JSON.parse(text);
      done();
    });
  });
  
  var tokenizer = new natural.WordTokenizer();

  it('should process the article just like the old pos module', function() {
    sentences.forEach(function(sentence, index) {
      var tokenized_sentence = tokenizer.tokenize(sentence);
      var taggedWords = brill_pos_tagger.tag(tokenized_sentence);
      expect(taggedWords).toEqual(posjs_results[index]);
    });
  });
  
  it('should initialise correctly with tagging rules for Dutch', function(done) {
    brill_pos_tagger = new Brill_POS_Tagger(du_lexicon_file, du_rules_file, 'N', function(error) {
      done();
    });
  });
  
  it('should correctly read a Volkskrant article about the ECB', function(done) {
    fs.readFile(du_ex_volkskrant_article, 'utf8', function (error, text) {
      sentences = text.split('\n');
      done();
    });
  });
  
  it('should process the Volkskrant article', function() {
    sentences.forEach(function(sentence, index) {
      var tokenized_sentence = tokenizer.tokenize(sentence);
      var taggedWords = brill_pos_tagger.tag(tokenized_sentence);
      expect(tokenized_sentence.length).toEqual(taggedWords.length);
      console.log(taggedWords);
    });
  });
  
});
Example #19
app.get("/api/1/messages/natural", function (req, res) {
	  console.log("natural language processing with " + req.query.q);
	  var natural = require('natural');
	  var sentiment = require('sentiment');
	  var nltk = {};
	  var tokenizer = new natural.WordTokenizer();
	  nltk.token = tokenizer.tokenize(req.query.q);
	  natural.LancasterStemmer.attach();
	  nltk.stem = req.query.q.tokenizeAndStem();
	  var resp = sentiment(req.query.q);
	  var sentimentValue = "NEUTRAL";
	  if (resp.score > 0) {
		  sentimentValue = "POSITIVE";
	  } else if (resp.score < 0) {
		  sentimentValue = "NEGATIVE";
	  }
	  nltk.sentiment = sentimentValue;
	  console.log(nltk);
	  res.send(nltk);
	});
Example #20
 var filteredResponses = _.filter(responses, function(response) {
   var tokens = tokenizer.tokenize(response.message)
   // If the origin and destination match and it's not a 'looking' post,
   // add it to the existing matches
   return isMatch(origin, destination, tokens) && !Util.stringMatch(tokens, 'looking')
 })
Example #21
  this.addToDataset = function (text, next) {
    let wordArr = tokenizer.tokenize(text);

    tagWords(wordArr, (err, resp) => {
      if (err) {
        return next(err);
      }

      let POSArr = JSON.parse(resp);
      updateDataset(POSArr, next);
    });
  };
Example #22
  /**
   * Runs all the plugins that power Chevy's decision-making process
   * in a promise chain.
   * @param  {Object} context conversation context
   * @return {Promise} resolves after the conversation, search and action plugins run
   */
  think(context) {
    // tokenize the query
    context.queryTokens = this.tokenizer.tokenize(context.query)

    return conversation(context)
    .then(function(context) {
      return search(context)
    })
    .then(function(context) {
      return action(context)
    })
  }
Example #23
	request("https://hacker-news.firebaseio.com/v0/item/" + articleId + ".json?print=pretty", function (error, response, body) {
		var article = JSON.parse(body),
			articleTitle = article.title || "",
			articleText = (article.text || "").toLowerCase();

		addKeywords(tokenizer.tokenize(articleTitle + articleText));

		saveLastId(articleId);

		console.log('Read article #' + article.id + ". Type: " + article.type);
		fetchRecursive(articleId + 1, remaining - 1);		
	});
Example #24
fs.readFile(fileName, 'utf8', function(err, data) {
  if (err) {
    return console.log('There was an error: ' + err);
  }

  // find and print number of lines
  var fileLinesInArray = data.split('\n');
  var numberOfLines = fileLinesInArray.length - 1;
  console.log(numberOfLines + " lines.");

  // find and print number of characters
  var characters = data.split('');
  var numberOfCharacters = characters.length;
  console.log(numberOfCharacters + " characters.");

  // find and print number of words using natural package
  var tokenizer = new natural.WordTokenizer();
  var wordsArray = tokenizer.tokenize(data);
  var numberOfWords = wordsArray.length;
  console.log(numberOfWords + " words");
});
Example #25
Index.prototype.add = function (filename, document, callback) {
  var self = this;
  var PUNCTUATION = ['.', ',', ':', ''];
  var tokenizer = new natural.WordTokenizer();
  var tokens = tokenizer.tokenize(document);

  // TODO: Remove stop words

  var tasks = tokens.filter(function (token) {
    return PUNCTUATION.indexOf(token) === -1;
  }).map(function (token) {
    return function (cb) {
      self.tokenClient.sadd(token, filename, cb);
    };
  });

  tasks.push(function (cb) {
    self.tokenClient.set(filename, document, cb);
  });

  async.parallel(tasks, callback);
};
Example #26
    this.classifyNN = function(net, msg){
      if(!net) throw _argumentError(0, 'net');
      if(!msg) throw _argumentError(1, 'msg');

      var msgTokens = tokenizer.tokenize(msg)
        , input = {}

      msgTokens.forEach(function (token){
        input[token] = 1;
      });

      return net.run(input);
    }
Example #27
    tagSchema.statics.tokenize = function (tagString) {
        var results = [],
            words = tokenizer.tokenize(tagString);

        for (var w in words) {
            var word = words[w].toLowerCase();

            if (useless[word] === undefined) {
                results.push(natural.PorterStemmer.stem(word));
            }
        }

        return results;
    };
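A hedged example of calling the static through a compiled model. `mongoose`, `tokenizer` and the `useless` stop-word map are assumed from the surrounding module and are not shown here.

// Hypothetical usage of the tokenize static.
var Tag = mongoose.model('Tag', tagSchema);

Tag.tokenize('Running with the runners');
// -> roughly [ 'run', 'runner' ], assuming 'with' and 'the' are listed in `useless`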
Example #28
var organizeWords = function(text){
	//text is a string
	//return a dictionary/object
	var words = {};
	var tokenizer = new natural.WordTokenizer();
	natural.PorterStemmer.attach();

	var imptWords = text.toLowerCase().tokenizeAndStem();
	var textArr = tokenizer.tokenize(text);

	//add real words into words object with count of 0
	for(var j=0; j< imptWords.length; j++){
		if(eng_words.indexOf(imptWords[j])>-1)
			words[imptWords[j]] = 0;
	}

	for(var i=0; i< textArr.length; i++){
		var root = natural.PorterStemmer.stem(textArr[i]);
		if(Object.keys(words).indexOf(root) > -1)
			words[root] += 1;
	}
	return words;
};
Example #29
  // prepares keyword list to use in findManyDataMap (where grouping categories are NOT important)
  prepareArticlekeywords(articles) {
    let sourceList = articles;
    let resultList = [];

    for (let k in sourceList) {
      let cleaned = this.prepareClean(sourceList[k]).toLowerCase();
      let tokenized = tokenizeOnly ? tokenizer.tokenize(cleaned) : cleaned.tokenizeAndStem(true);
      resultList.push({
        key: tokenized.join(' '),
        span: tokenized.length
      });
    }

    return resultList;
  }
Example #30
function grabTopics(text) {
    var datesCompacted = classifier_1.runThroughClassifiers(text, dateClassifiers);
    var datesGrouped = _.groupBy(datesCompacted, 'topic');
    // keep only the tokens that parse as numbers
    var specials = _.compact(tokenizer.tokenize(text).filter(function (token) {
        return !isNaN(parseInt(token, 10));
    }));
    var intent = {
        action: null,
        details: {
            dates: _.mapValues(datesGrouped, function (classifications) {
                return classifications.map(function (classification) {
                    return _.startCase(classification.label);
                });
            }),
            specialWords: specials,
            locations: locatonExtractor(text),
        },
        topic: 'details',
    };
    // if (this && this.debugOn) { console.log('details intent', util.inspect(intent, { depth: null })); };
    return Promise.resolve(intent);
}