/**
 * GET /api/1/messages/natural?q=<text>
 *
 * Tokenizes and stems the `q` query parameter with the `natural` package,
 * scores it with the `sentiment` package, and responds with
 * `{ token: [...], stem: [...], sentiment: "POSITIVE" | "NEGATIVE" | "NEUTRAL" }`.
 * Responds with HTTP 400 when `q` is missing or is not a single string.
 */
app.get("/api/1/messages/natural", function (req, res) {
    var text = req.query.q;
    console.log("natural language processing with" + text);

    // A missing `q` is undefined and a repeated `q` is an array; neither has
    // tokenizeAndStem(), so reject them instead of throwing inside the handler.
    if (typeof text !== 'string') {
        res.status(400).send({ error: "query parameter 'q' must be a string" });
        return;
    }

    var natural = require('natural');
    var sentiment = require('sentiment');

    // Declared locally: the original assignment created an implicit global.
    var tokenizer = new natural.WordTokenizer();

    var nltk = {};
    nltk.token = tokenizer.tokenize(text);

    // attach() patches String.prototype with tokenizeAndStem().
    natural.LancasterStemmer.attach();
    nltk.stem = text.tokenizeAndStem();

    // Map the numeric score onto a three-way label; exactly 0 stays NEUTRAL.
    var score = sentiment(text).score;
    if (score > 0) {
        nltk.sentiment = "POSITIVE";
    } else if (score < 0) {
        nltk.sentiment = "NEGATIVE";
    } else {
        nltk.sentiment = "NEUTRAL";
    }

    console.log(nltk);
    res.send(nltk);
});
// Queries the 'analysis/tweets_month_date' view and counts, per month
// (doc.key[1]), how many hashtags in the row text contain `keyword`
// (case-insensitive substring match). Responds with [[month, count], ...].
// On a view error the error is logged and an empty list is sent.
politicianDB.view('analysis', 'tweets_month_date', params, function (err, body) {
    if (err) {
        console.log(err);
        res.send(200, []);
        return;
    }

    console.log("trendyTopic");
    console.log(req.query.keyword);

    // NOTE(review): nothing in this callback uses the stemmer. The call is kept
    // because attach() patches String.prototype globally and other code may
    // rely on that side effect - confirm before removing.
    natural.LancasterStemmer.attach();

    var hashtagRegexp = /#([a-zA-Z0-9]+)/g;
    // Lower-cased once instead of once per hashtag.
    var needle = keyword.toLowerCase();
    // Prototype-free so a key can never collide with an inherited property.
    var wordsCount = Object.create(null);

    body.rows.forEach(function (doc) {
        // Rows without text cannot contain hashtags; match() would throw on them.
        if (typeof doc.value !== 'string') {
            return;
        }
        var month = doc.key[1];
        // match() returns null when the text has no hashtag.
        var hashtags = doc.value.match(hashtagRegexp) || [];
        hashtags.forEach(function (tag) {
            if (tag.toLowerCase().indexOf(needle) > -1) {
                wordsCount[month] = (wordsCount[month] || 0) + 1;
            }
        });
    });

    var tuples = [];
    for (var key in wordsCount) {
        tuples.push([key, wordsCount[key]]);
    }
    res.send(200, tuples);
});
/**
 * Returns the most frequent token in `tokens`, or '' when nothing qualifies.
 * Tokens that contain a digit, equal 'undefined' or 'nbsp', or are longer than
 * 40 characters are ignored. Ties go to the token that appears first.
 *
 * @param {string[]} tokens
 * @returns {string}
 */
function pickMostFrequentToken(tokens) {
    // Prototype-free so tokens like "constructor" are counted like any other.
    var freq = Object.create(null);
    var i;

    for (i = 0; i < tokens.length; i++) {
        var token = tokens[i];
        if (/\d/.test(token) || token === 'undefined' || token === 'nbsp' || token.length > 40) {
            // Skip only this token. The original `break` stopped counting at
            // the first junk token and ignored the rest of the document.
            continue;
        }
        freq[token] = (freq[token] || 0) + 1;
    }

    var importantWord = '';
    var importantFreq = 0;
    // Second pass in document order so ties resolve to the earliest token.
    for (i = 0; i < tokens.length; i++) {
        if (freq[tokens[i]] > importantFreq) {
            importantWord = tokens[i];
            importantFreq = freq[tokens[i]];
        }
    }
    return importantWord;
}

// Fetches `url`, strips the HTML, stems the text and answers with the single
// most frequent stem. A failed fetch is answered with 502 and an empty body.
request(url, function (error, response, body) {
    if (error || response.statusCode !== 200) {
        // Previously no response was written here, leaving the client hanging.
        res.writeHead(502, headers);
        res.end('');
        return;
    }

    var text = stripHTML(body);
    // attach() patches String.prototype with tokenizeAndStem().
    natural.LancasterStemmer.attach();
    var tokens = text.tokenizeAndStem();

    res.writeHead(200, headers);
    res.end(pickMostFrequentToken(tokens));
});
var search = require('./search.js'); var COURSE_REGEX = /[a-z]{4}[0-9]{3}/gi; var natural = require('natural') var jsonfile = require('jsonfile') var fs = require('fs'); var WORDS = require('../WORDS.js').WORDS; natural.LancasterStemmer.attach(); var NGrams = natural.NGrams; var path = require('path'); var COURSE_REGEX = /[a-z]{4}[0-9]{3}/gi; // useful functions Array.prototype.unique = function(){ // from underscore.js var u = {}, a = []; for(var i = 0, l = this.length; i < l; ++i){ if(u.hasOwnProperty(this[i])) { continue; } a.push(this[i]); u[this[i]] = 1; } return a; } Array.prototype.contains = function(element){ return this.indexOf(element) > -1; };
// Queries the 'analysis/tweets_by_name' view, counts every @mention in the row
// text (case-insensitive) and responds with the mention names ordered from most
// to least frequent. On a view error the error is logged and [] is sent.
politicianRelationshipDB.view('analysis', 'tweets_by_name', params, function (err, body) {
    if (err) {
        console.log(err);
        res.send(200, []);
        return;
    }

    // NOTE(review): nothing in this callback uses the stemmer. The call is kept
    // because attach() patches String.prototype globally and other code may
    // rely on that side effect - confirm before removing.
    natural.LancasterStemmer.attach();

    var mentionRegexp = /@([a-zA-Z0-9]+)/g;
    var wordsMap = Object.create(null);

    body.rows.forEach(function (doc) {
        // match() would throw on rows whose value is not a string.
        if (typeof doc.value !== 'string') {
            return;
        }
        // match() returns null when the text has no mention.
        var mentions = doc.value.match(mentionRegexp) || [];
        mentions.forEach(function (mention) {
            var key = mention.toLowerCase();
            wordsMap[key] = (wordsMap[key] || 0) + 1;
        });
    });

    // Serialised explicitly; concatenating the object printed "[object Object]".
    console.log('wordsMap=' + JSON.stringify(wordsMap));

    var tuples = [];
    for (var key in wordsMap) {
        tuples.push([key, wordsMap[key]]);
    }
    // Highest count first.
    tuples.sort(function (a, b) {
        return b[1] - a[1];
    });

    // The original loop stopped after index 500, so at most 501 names are sent.
    // NOTE(review): 500 may have been the intended cap - confirm.
    var list = tuples.slice(0, 501).map(function (tuple) {
        return tuple[0];
    });

    console.log(list);
    res.send(200, list);
});
/**
 * Tokenizes and stems `text` using natural's Lancaster stemmer.
 *
 * @param {*} text - Text to normalize; non-string values are converted with String().
 * @returns {Array} The result of tokenizeAndStem(), or [] when `text` is null or undefined.
 */
function normalize(text) {
    // Nothing to stem; previously this threw a TypeError.
    if (text === null || text === undefined) {
        return [];
    }
    // attach() patches String.prototype with tokenizeAndStem().
    natural.LancasterStemmer.attach();
    // maybe need to remove urls too
    return String(text).tokenizeAndStem();
}
// Queries the 'analysis/tweets_by_name' view, counts stemmed words (minus the
// stop words below) across all rows and responds with a tree:
//   {name: 'flare', children: [{name: 'group1', children: [{name, size}, ...]}, ...]}
// Only words seen more than once among the top-ranked entries are included, in
// groups of ten. Responds with [] when nothing qualifies or the view fails.
politicianRelationshipDB.view('analysis', 'tweets_by_name', params, function (err, body) {
    if (err) {
        console.log(err);
        res.send(200, []);
        return;
    }

    var stopWords = ['i', 'and', 'you', ',', '.', '?', 'will', 'just', 'http', 'co', 'it', 'the', 'a', 'so', 'today',
        '1', '2', '3', '4','25','10','12', '6', 'in', 'at', 'rt', 'on','https','he','she','no','not','is','are','am','yes','how','when','why','what',
        'via','think','let','day','wil','if','go','ask','tell','off','amp','back','good','say','speak','read','that','which','want','from','ok','about',
        'please','need','we','now'];

    // attach() patches String.prototype with tokenizeAndStem().
    natural.LancasterStemmer.attach();

    // Prototype-free so a word such as "constructor" is counted as a number
    // instead of being appended to an inherited property.
    var wordsMap = Object.create(null);
    body.rows.forEach(function (doc) {
        // tokenizeAndStem() only exists on strings.
        if (typeof doc.value !== 'string') {
            return;
        }
        _.each(doc.value.tokenizeAndStem(), function (word) {
            var key = word.toLowerCase();
            if (_.contains(stopWords, key)) {
                return;
            }
            wordsMap[key] = (wordsMap[key] || 0) + 1;
        });
    });

    var tuples = [];
    for (var key in wordsMap) {
        tuples.push([key, wordsMap[key]]);
    }
    // Highest count first.
    tuples.sort(function (a, b) {
        return b[1] - a[1];
    });

    // The original loop examined indices 0..200 of the ranking; keep that window.
    // NOTE(review): 200 entries may have been the intended cap - confirm.
    var frequent = [];
    for (var i = 0; i < tuples.length && i <= 200; i++) {
        if (tuples[i][1] > 1) {
            frequent.push({name: tuples[i][0], size: tuples[i][1]});
        }
    }

    // Chunk into groups of ten. The previous `n % 10 == 0` check also fired on
    // iterations that added nothing (emitting empty groups) and never flushed
    // the last partial group, silently dropping its words.
    var groups = [];
    for (var start = 0; start < frequent.length; start += 10) {
        groups.push({name: 'group' + (groups.length + 1), children: frequent.slice(start, start + 10)});
    }

    console.log(groups);
    if (groups.length === 0) {
        res.send(200, []);
    } else {
        res.send(200, {name: 'flare', children: groups});
    }
});
// Queries the 'analysis/tweets_by_name' view, counts stemmed words (minus the
// stop words below) across all rows and responds with up to six of the most
// frequent words that occur more than once, each wrapped in its own array:
// [[word], [word], ...]. On a view error the error is logged and [] is sent.
politicianRelationshipDB.view('analysis', 'tweets_by_name', params, function (err, body) {
    if (err) {
        console.log(err);
        res.send(200, []);
        return;
    }

    var stopWords = ['i', 'and', 'you', ',', '.', '?', 'will', 'just', 'http', 'co', 'it', 'the', 'a', 'so', 'today',
        '1', '2', '3', '4','5','6','7','8','9','0', 'in', 'at', 'rt', 'on','amp'];

    // attach() patches String.prototype with tokenizeAndStem().
    natural.LancasterStemmer.attach();

    // Prototype-free so a word such as "constructor" is counted as a number.
    var wordsMap = Object.create(null);
    body.rows.forEach(function (doc) {
        // tokenizeAndStem() only exists on strings.
        if (typeof doc.value !== 'string') {
            return;
        }
        _.each(doc.value.tokenizeAndStem(), function (word) {
            var key = word.toLowerCase();
            if (_.contains(stopWords, key)) {
                return;
            }
            wordsMap[key] = (wordsMap[key] || 0) + 1;
        });
    });

    var tuples = [];
    for (var key in wordsMap) {
        tuples.push([key, wordsMap[key]]);
    }
    // Highest count first. The comparator no longer logs: it runs on every
    // comparison and flooded the console.
    tuples.sort(function (a, b) {
        return b[1] - a[1];
    });

    var list = [];
    for (var i = 0; i < tuples.length && list.length < 6; i++) {
        var word = tuples[i][0];
        var count = tuples[i][1];
        // The ranking is descending, so once a count drops to 1 nothing later
        // can qualify; stop instead of scanning the remainder.
        if (count <= 1) {
            break;
        }
        console.log(word);
        console.log(count);
        list.push([word]);
    }

    res.send(200, list);
});
// Queries the 'analysis/geolocation' view and responds with every row whose
// text contains `keyword` inside one of its alphanumeric words
// (case-insensitive substring match). Each entry has the shape
// {lat, lng, name, content}, built from doc.key[0][0], doc.key[0][1],
// doc.key[1] and doc.value. On a view error the error is logged and [] is sent.
politicianDB.view('analysis', 'geolocation', function (err, body) {
    if (err) {
        console.log(err);
        res.send(200, []);
        return;
    }

    console.log("keyword=" + keyword);

    // NOTE(review): nothing in this callback uses the stemmer. The call is kept
    // because attach() patches String.prototype globally and other code may
    // rely on that side effect - confirm before removing.
    natural.LancasterStemmer.attach();

    var wordRegexp = /([a-zA-Z0-9]+)/g;
    // Lower-cased once instead of once per word.
    var needle = keyword.toLowerCase();
    var matches = [];

    body.rows.forEach(function (doc) {
        // match() would throw on rows whose value is not a string.
        if (typeof doc.value !== 'string') {
            return;
        }
        // match() returns null when the text has no alphanumeric word.
        var words = doc.value.match(wordRegexp) || [];
        var found = false;
        for (var i = 0; i < words.length; i++) {
            if (words[i].toLowerCase().indexOf(needle) > -1) {
                found = true;
                break; // one hit is enough to include the row
            }
        }
        if (found) {
            console.log("doc found=" + doc.value);
            matches.push({lat: doc.key[0][0], lng: doc.key[0][1], name: doc.key[1], content: doc.value});
        }
    });

    res.send(200, matches);
});
/**
 * Transform step: tokenizes and stems the incoming chunk with natural's
 * Lancaster stemmer and passes the JSON-encoded token array downstream.
 *
 * @param {Buffer|string} chunk - Incoming data; converted with toString().
 * @param {string} enc - Encoding reported by the stream (unused).
 * @param {function(?Error, *)} done - Callback that receives the output chunk.
 */
stream._transform = function (chunk, enc, done) {
    var stems = natural.LancasterStemmer.tokenizeAndStem(chunk.toString());
    // done(null, data) already pushes `data` to the readable side. The extra
    // stream.emit("data", ...) delivered every chunk to consumers twice and
    // ran the stemmer a second time.
    done(null, JSON.stringify(stems));
};