var full_text_datasets = []; // 存放整個文章輸入wordnet後的json //格式為{ {第一個word的同義詞資訊}, // {第二個word的同義詞資訊}, // {第三個word的同義詞資訊}, // ...... } // 每一個同義詞資訊是一個object array,其中每個元素都是他的同義詞 function SearchingWordNet( userSearchString, res, req ) { var natural = require('natural'); var wordnet = new natural.WordNet('/usr/local/lib/node_modules/WNdb/dict'); var i = 0 ; //將使用者從text area輸入的文章切token,先把逗號和句號替換成空白,再用空白切 //可能的bug:萬一有縮寫用到句號的就會被切散成分開的字串 replace_comma = String(req.body.userSearchString).replace(/\,/g,' '); replace_dot = replace_comma.replace(/\./g,' '); temp_token_sets = replace_dot.split(" "); inputs_into_token = [] ; full_text_datasets = []; // 不知道為甚麼split會把雙空白相連的這兩個字元的其中一個當作是內文。。。 // 還有如果直接del array[3]之類的,array總長度還是不會變(javascript有夠笨 = =) // ex: var A = ["1", "2", "3"]; // del A[1]; // 印出A會變成:["1", , "3"] // 所以重新建立一個新的把不是空的元素丟進去 for ( var i = 0 ; i < temp_token_sets.length ; i++) if ( temp_token_sets[i].length != 0 ) inputs_into_token.push(temp_token_sets[i]); //console.log('///' + replace_comma + '\n-------------------'); //console.log(replace_dot + '\n-------------------'); //console.log(inputs_into_token.length + '\n-------------------'); //console.log(inputs_into_token + '\n-------------------'); // inputs_into_token裏面正常的token,可以開始塞進wordnet //然後一一餵給wordnet拿出資料 //並存入json檔中 for ( walk = 0 ; walk < inputs_into_token.length ; walk++ ){ current = inputs_into_token[walk]; // 將使用者輸入的字切成token後送到wordnet去抓同義字資訊 wordnet.lookup(current, function(results) { // 每個字可能有很多同義字,每個同義字是一個one_wordInfo的object //用foreach把所有的這種object塞到wordnetDatas的array results.forEach(function(result) { one_wordInfo = new WordInfo(result.wordInfo, result.synsetOffset, result.pos, result.lemma, result.synonyms, result.gloss ); wordnetDatas.push(one_wordInfo); //console.log('#$$ ' + JSON.stringify(one_wordInfo)); }); // forEach } // function(results) ); // lookup full_text_datasets.push(wordnetDatas); //console.log('=======\n' + JSON.stringify(wordnetDatas)); //console.log('********\n' + 
JSON.stringify(full_text_datasets)); } // for console.log('$' + wordnetDatas); setTimeout(function(){ res.render('index',{ title: 'NTNU Bioinformatics courses', wordnetDatas: wordnetDatas, targetStr : 'You have searched : '+ userSearch }); }, 5000); wordnetDatas = []; full_text_datasets = []; } // SearchingWordNet()
/**
 * Looks the current search term up in WordNet, stores one WordInfo per result
 * in the shared `wordnetDatas` array (starting at index 0) and renders the
 * `index` view.
 *
 * NOTE(review): the term is read from `userSearch`, not from the
 * `userSearchString` parameter - presumably a module-level variable; confirm.
 *
 * @param {string} userSearchString - Not read by this function.
 * @param {object} res - Response object; `res.render('index', locals)` is
 *   called from the lookup callback.
 */
function SearchingWordNet(userSearchString, res) {
  var natural = require('natural');
  var wordnet = new natural.WordNet('/usr/local/lib/node_modules/WNdb/dict');
  var nextSlot = 0;

  function storeResult(entry) {
    wordnetDatas[nextSlot] = new WordInfo(entry.synsetOffset, entry.pos,
      entry.lemma, entry.synonyms, entry.gloss);
    nextSlot += 1;
  }

  function onLookup(results) {
    results.forEach(function (entry) {
      storeResult(entry);
    });
    res.render('index', {
      title: 'NTNU Bioinformatics courses',
      wordnetDatas: wordnetDatas,
      targetStr: 'You have searched : ' + userSearch
    });
  }

  wordnet.lookup(String(userSearch), onLookup);
} // SearchingWordNet()
/**
 * Tree node that expands a word into its WordNet synonyms.
 *
 * The node always gets a `name`. Unless `level` is 0 it also gets a `children`
 * array, which the `wn.lookup` callback fills with one WordNetter (at
 * `level - 1`) per synonym that has not been recorded in `used` yet and that
 * `trie` contains.
 *
 * @param {string} seed - Word stored as this node's name and looked up.
 * @param {number} level - Remaining expansion depth; 0 creates a leaf.
 */
var WordNetter = function(seed, level) {
  var self = this;
  self.name = seed;
  if (level == 0) return;
  self.children = [];

  // Adds one child for a synonym that is new and known to the trie.
  function adopt(candidate) {
    if (used.indexOf(candidate) != -1 || !trie.contains(candidate)) {
      return;
    }
    used.push(candidate);
    self.children.push(new WordNetter(candidate, level - 1));
    console.log("!");
  }

  function expand(senses) {
    senses.forEach(function (sense) {
      sense.synonyms.forEach(function (candidate) {
        adopt(candidate);
      });
    });
  }

  wn.lookup(self.name, expand);

  // Not called at the moment: returns the result with the most synonyms,
  // scanning from the last result towards the first.
  function findMostSynResult(results) {
    var best = results[0];
    var idx = results.length;
    while (idx-- > 0) {
      if (results[idx].synonyms.length > best.synonyms.length) {
        best = results[idx];
      }
    }
    return best;
  }
};
// Feature not currently supported.
// At the end of a response, look at newWords and generate a sentiment score.
// The new score is the average sentiment score of the pieces of text that
// contained the word.
/**
 * Finalises every entry of `newWords`: entries seen fewer than two times are
 * deleted, all others get `SA` replaced by `SA / count`.
 *
 * Each word is also looked up in WordNet; for every result at or after the
 * first adjective result the running adjective count is appended to that
 * word's own `wordPos` list, which is logged. The lookup outcome does not
 * influence the score.
 */
function trainWords() {
  // Object.keys gives every key its own closure (and skips inherited keys), so
  // an asynchronous lookup callback cannot write into another word's list.
  Object.keys(newWords).forEach(function (key) {
    var wordPos = [];

    // Only look at adjectives
    wordnet.lookup(key, function (results) {
      var adjectiveCount = 0;
      results.forEach(function (result) {
        if (result.pos === "a") {
          adjectiveCount++;
        }
        if (adjectiveCount > 0) {
          wordPos.push(adjectiveCount);
          console.log(wordPos);
        }
      });
    });

    var entry = newWords[key];
    if (entry.count < 2) {
      // Ignore words that have been mentioned fewer than two times
      delete newWords[key];
    } else {
      // Compute the score for the new word
      entry.SA = entry.SA / entry.count;
    }
  });
}
/**
 * Looks `word` up in WordNet and passes the synonyms of all its results to
 * `callback` as one flat array (in result order, duplicates kept).
 *
 * @param {string} word - Word to look up.
 * @param {function(null, string[])} callback - Receives `(null, synonyms)`;
 *   the array is empty when WordNet has no result for the word.
 */
function wordnetsyn(word, callback) {
  wordnet.lookup(word, function (results) {
    var out = [];
    (results || []).forEach(function (result) {
      // `synonyms` lives on each result, not on the results array itself.
      out = out.concat(result.synonyms || []);
    });
    callback(null, out);
  });
}
/**
 * Looks `word` up in WordNet and reports the synonyms of its first result.
 *
 * The lookup is callback-based, so the synonyms cannot be returned directly;
 * pass `callback` to receive them. Calling without a callback still triggers
 * the lookup and, as before, yields no value.
 *
 * @param {string} word - Word to look up.
 * @param {function(null, string[])} [callback] - Receives `(null, synonyms)`;
 *   the array is empty when WordNet has no result for the word.
 * @returns {undefined}
 */
var synset = function (word, callback) {
  wordnet.lookup(word, function (results) {
    // Unknown words yield an empty result list; do not dereference results[0].
    var first = results && results[0];
    var synonyms = first && first.synonyms ? first.synonyms : [];
    if (typeof callback === 'function') {
      callback(null, synonyms);
    }
  });
};
exports.define = function (word, cb) { wordnet.lookup(word, function (results) { if (!_.isEmpty(results)) { cb(null, results[0].def); } else { cb("no results"); } }); };
exports.synsetCosineSimilarity = function (line1, line2) //based on words and their wordnet synsets in the sentence { wordsA = line1.tokenizeAndStem(); //remove stop words, stemming wordsB = line2.tokenizeAndStem(); wordsAsyn = []; wordsBsyn = []; for(var i=0;i<wordsA.length;i++) { var set = wordnet.lookup(wordsA[i], function (results) { console.log(results); return results[0].synonyms; }); console.log(set); if (set) { wordsAsyn.concat(set); } else { wordsAsyn.push(wordsA[i]); } } for (var i = 0; i < wordsB.length; i++) { var set = wordnet.lookup(wordsB[i], function (results) { return results[0].synonyms; }); console.log(set); if (set) { wordsBsyn.concat(synset(wordsB[i])); } else { wordsBsyn.push(wordsB[i]); } } console.log("line A syn"); console.log(wordsAsyn); console.log("line B syn"); console.log(wordsBsyn); return COSINE.textCosineSimilarity(wordsAsyn, wordsBsyn); }
/**
 * Fetches the WordNet synonyms of `seed` and passes the lemmas of the verb
 * entries to `callback`, with underscores turned into spaces and duplicates
 * removed.
 *
 * @param {string} seed - Word whose synonyms are requested.
 * @param {function(null, string[])} callback - Receives `(null, lemmas)`.
 */
function wordnetquickfetch(seed, callback) {
  wordnet.lookupSynonyms(seed, function (results) {
    var output = [];
    _.each(results, function (value) {
      // Comparison, not assignment: `=` accepted every entry and overwrote
      // its part of speech.
      if (value['pos'] === "v") {
        output.push(value['lemma'].split("_").join(" "));
      }
    });
    callback(null, _.unique(output));
  });
}
/**
 * Strips `prefix` from the start of `word` and checks whether WordNet knows the
 * remainder; the word counts as "negative" when the lookup returns no results.
 *
 * The lookup is callback-based, so the verdict is delivered through `callback`.
 * The synchronous return value is kept for existing callers, but it is only
 * meaningful if the lookup happens to answer before this function returns;
 * otherwise it is `false`.
 *
 * @param {string} word - Word to test.
 * @param {string} prefix - Prefix to remove; it is inserted into a regular
 *   expression unescaped, so regex syntax in it is interpreted.
 * @param {function(null, boolean)} [callback] - Receives `(null, isNegative)`.
 * @returns {boolean} See above.
 */
function is_negative(word, prefix, callback) {
  var stripped = word.replace(new RegExp("^" + prefix), "");
  var verdict = false;

  wordnet.lookup(stripped, function (results) {
    console.log(results);
    verdict = results.length === 0;
    if (typeof callback === 'function') {
      callback(null, verdict);
    }
  });

  return verdict;
}
/**
 * Looks `word` up in WordNet and builds a summary record for it.
 *
 * The record passed to `callback` has the shape
 * `{ word: <Porter stem>, synonyms: [...], speach: <pos or null>, means: null }`:
 *  - with WordNet results, `synonyms` is the union of all results' synonyms and
 *    `speach` the most frequent part of speech among the results;
 *  - without results, the stem is searched in the aliases of `worddata.words`;
 *    on a match `speach` is copied from that entry and its `action` is pushed
 *    onto `actions`.
 *
 * @param {string} word - Word to look up.
 * @param {function(null, object)} callback - Receives `(null, record)`.
 */
lookupWord = function(word, callback) {
  // define the word
  wordnet.lookup(word, function(results) {
    // All working variables are local so nothing leaks into the global scope.
    var stem = natural.PorterStemmer.stem(word);

    var d = {
      word: stem,
      synonyms: [],
      speach: null,
      means: null
    };

    if (results.length) {
      var pos = [];
      _.each(results, function(result) {
        d.synonyms = _.union(d.synonyms, result.synonyms);
        pos.push(result.pos);
      });
      // most frequent part of speech
      d.speach = _.chain(pos).countBy().pairs().max(_.last).head().value() || null;
    } else {
      // otherwise, try to find the stem among the known aliases
      var possib = _.find(worddata.words, function(w) {
        return _.contains(w.aliases, stem);
      });
      if (possib) {
        d.speach = possib.speach;
        actions.push(possib.action);
      }
    }

    callback(null, d);
  });
};
// Collect the synonym lemmas of `word` (first WordNet result only) into
// emoji_syn[emoji], then count the word as processed. `callback` fires once
// every entry of `emojiarray` has been counted.
wordnet.lookup(word, function(results){
  // Counting happens only after the synonyms (if any) have been stored, so
  // `callback` cannot run while a getSynonyms request is still in flight.
  function finishWord() {
    synonyms_count++;
    if (synonyms_count === emojiarray.length) {
      callback();
    }
  }

  if (results.length === 0) {
    finishWord();
    return;
  }

  wordnet.getSynonyms(results[0], function(synonyms){
    synonyms.forEach(function(synonym){
      emoji_syn[emoji].push(synonym.lemma);
    });
    finishWord();
  });
});
exports.lookupWord = function (word) { stemmer.attach(); console.log('i stemmed the words.'.tokenizeAndStem()); wordnet.lookup(word, function (results) { results.forEach(function (result) { console.log('------------------------------------'); console.log(result.synsetOffset); console.log(result.pos); console.log(result.lemma); console.log(result.synonyms); console.log(result.pos); console.log(result.gloss); }); }); }
/**
 * Resolves the words that `word` points to through a given WordNet pointer
 * type.
 *
 * `word` may carry a part-of-speech suffix of the form `~x` (one word
 * character); it is stripped before the lookup and restricts the pointers to
 * that part of speech. Every matching pointer is resolved with `wordnet.get`,
 * and the resulting lemmas are de-duplicated and have underscores replaced by
 * spaces.
 *
 * @param {string} word - Word to look up, optionally ending in `~<pos>`.
 * @param {string} [pointerSymbol="~"] - Pointer symbol to follow.
 * @param {function(*, string[])} cb - Receives `(err, lemmas)`.
 */
var wdlookup = exports.lookup = function (word, pointerSymbol, cb) {
  var symbol = pointerSymbol || "~";
  var pos = null;

  var suffix = word.match(/~(\w)$/);
  if (suffix) {
    pos = suffix[1];
    word = word.replace(suffix[0], "");
  }

  // A pointer qualifies when its symbol matches and, if a part of speech was
  // requested, its part of speech matches too.
  function wanted(part) {
    return part.pointerSymbol === symbol && (pos === null || part.pos === pos);
  }

  function resolveLemma(pointer, next) {
    wordnet.get(pointer.synsetOffset, pointer.pos, function (target) {
      next(null, target.lemma);
    });
  }

  function spaced(lemma) {
    return lemma.replace(/_/g, " ");
  }

  wordnet.lookup(word, function (results) {
    var synets = [];
    results.forEach(function (result) {
      result.ptrs.forEach(function (part) {
        if (wanted(part)) {
          synets.push(part);
        }
      });
    });

    async.map(synets, resolveLemma, function (err, items) {
      var unique = _.uniq(items);
      cb(err, unique.map(function (lemma) {
        return spaced(lemma);
      }));
    });
  });
};
/**
 * Collects the distinct synonyms of `word` from all WordNet results whose part
 * of speech equals `pos`, leaving out the word itself.
 *
 * @param {string} word - Word to look up.
 * @param {string} pos - Part of speech the results must have.
 * @param {function(string, string, string[])} callback - Receives
 *   `(word, pos, synonyms)`.
 */
function get_synonyms(word, pos, callback) {
  wordnet.lookup(word, function(results) {
    var possible_synonyms = [];

    // The iteratees return undefined to skip an item. Returning `false` would
    // end the whole iteration early with lodash's `_.each`.
    _.each(results, function(result) {
      if (result["pos"] !== pos) {
        return;
      }
      _.each(result["synonyms"], function(synonym) {
        if (synonym === word || _.contains(possible_synonyms, synonym)) {
          return;
        }
        possible_synonyms.push(synonym);
      });
    });

    callback(word, pos, possible_synonyms);
  });
}
/**
 * Takes the next word off `words` and classifies it.
 *
 *  - No words left: returns the result of `add_topics()`.
 *  - Words of three characters or fewer go to `save_junk_topic`, and the next
 *    word is processed immediately.
 *  - Words ending in "ing" are classified with a verb score of 100.
 *  - Purely numeric words are classified with all scores 0.
 *  - Everything else is looked up in WordNet; the results are counted per part
 *    of speech (noun, verb, or neither for a/r/s) and passed to
 *    `classify_word` from the lookup callback.
 *
 * @returns {*} The value of `add_topics()` when the list is empty, otherwise
 *   undefined.
 */
function lookup_next_word () {
  if (words.length === 0) {
    return add_topics();
  }

  var current = words.shift();
  // `word` is not declared in this function; the assignment is kept in case
  // code outside this function reads it.
  word = current;

  var neither = 0;
  var noun = 0;
  var verb = 0;

  if (current.length <= 3) {
    save_junk_topic(current);
    lookup_next_word();
    return;
  }

  // Compare the last three characters: the previous check looked at
  // everything *except* the suffix.
  if (current.substring(current.length - 3) === "ing") {
    verb = 100;
    classify_word(current, noun, verb, neither);
  } else if (current.match(/^[0-9]*$/)) {
    classify_word(current, noun, verb, neither);
  } else {
    wordnet.lookup(current, function(results) {
      results.forEach(function(result) {
        if (result.pos === "n") {
          noun++;
        } else if (result.pos === "v") {
          verb++;
        } else if (result.pos === "a" || result.pos === "r" || result.pos === "s") {
          neither++;
        }
      });
      classify_word(current, noun, verb, neither);
    });
  }
}
exports.explore = function (word, cb) { var ptrs = []; wordnet.lookup(word, function (results) { for (var i = 0; i < results.length; i++) { ptrs.push(results[i].ptrs); } ptrs = _.uniq(_.flatten(ptrs)); ptrs = _.map(ptrs, function (item) { return { pos: item.pos, sym: item.pointerSymbol }; }); ptrs = _.chain(ptrs) .groupBy("pos") .map(function (value, key) { return { pos: key, ptr: _.uniq(_.map(value, "sym")) }; }) .value(); var itor = function (item, next) { var itor2 = function (ptr, next2) { wdlookup(word + "~" + item.pos, ptr, function (err, res) { // console.log(err); // console.log(word, item.pos, ":", ptr, res.join(", ")); // console.log(res); next2(); }); }; async.map(item.ptr, itor2, next); }; async.each(ptrs, itor, function () { cb(); }); }); };
// Look every token up in WordNet. For each token other than 'a', the first
// result that is a noun or an adjective is logged; nouns are also added to
// nounList. Once all tokens have been answered, performSearch(nounList) runs.
tokensFromString.forEach(function(currentWord){
  wordnet.lookup(currentWord, function(results) {
    found = false;
    count++;

    results.forEach(function(result) {
      // Only the first noun/adjective result counts; the token 'a' is skipped.
      if (found == true || currentWord == 'a') {
        return;
      }
      var isNoun = result.pos == 'n';
      if (!isNoun && result.pos != 'a') {
        return;
      }
      console.log(currentWord.green+' : '+result.pos.green);
      if (isNoun) {
        nounList.push(currentWord);
      }
      found = true;
    });

    if (count != tokensFromString.length) {
      return;
    }
    console.log('Finished getting word definitions');
    performSearch(nounList); // Call next function
  });
});
// Demo: print the tf-idf measure of two terms for every document known to
// `tfidf`, then print the WordNet data for "bank".

// Logs the measure of one document.
function logDocumentMeasure(i, measure) {
  console.log('document #' + i + ' is ' + measure);
}

console.log('\n-- tfidf for word "Congress" in three test documents:');
console.log('Congress:');
tfidf.tfidfs('Congress', logDocumentMeasure);

console.log('\n-- tfidf for word "taxes" in three test documents:');
console.log('taxes:');
tfidf.tfidfs('taxes', logDocumentMeasure);

// The WordNet data directory is taken from the WORDNET_DATA environment variable.
var wordnet_data_path = process.env.WORDNET_DATA;
console.log("Wordnet data path: " + wordnet_data_path);
var wordnet = new natural.WordNet(wordnet_data_path);

// Readable names for the part-of-speech codes.
var pos_map = {
  v: 'verb',
  n: 'noun',
  a: 'adjective',
  s: 'adjective',
  r: 'adverb'
};

// Prints one WordNet result; the heading is repeated for every result.
function describeSense(result) {
  console.log('\n-- Wordnet data for "bank":');
  console.log(' part of speech: ' + pos_map[result.pos]);
  console.log(' lemma: ' + result.lemma);
  console.log(' synonyms: ' + result.synonyms);
  console.log(' gloss: ' + result.gloss);
}

wordnet.lookup('bank', function(results) {
  results.forEach(function(result) {
    describeSense(result);
  });
});
// Demo: look the word "node" up in WordNet and print the main fields of every
// result, each result preceded by a separator line.
var natural = require('natural');
var wordnet = new natural.WordNet();

// Prints one WordNet result.
function printSense(sense) {
  console.log('------------------------------------');
  console.log("synsetOffset:"+sense.synsetOffset);
  console.log("pos:"+sense.pos);
  console.log("lemma:"+sense.lemma);
  console.log("synonyms:"+sense.synonyms);
  console.log("gloss:"+sense.gloss);
}

wordnet.lookup('node', function(senses) {
  for (var s = 0; s < senses.length; s++) {
    printSense(senses[s]);
  }
});
exports.generateData = function() { var natural = require('natural'); var wp = require('wordpos'); var hashes = require('hashes'); var wait = require('wait.for'); var fs = require("fs"); var data = fs.readFileSync("text/ofk.txt").toString(); var tokenizer = new natural.WordTokenizer(); var tokens = tokenizer.tokenize(data); var hashtable = new hashes.HashTable(); var WordPOS = new wp(); tokens.forEach(function (token) { token = token.toLowerCase(); var notSW = natural.stopwords.indexOf(token) == -1; // token = natural.PorterStemmer.stem(token); if (notSW) { var pair = hashtable.get(token); if (pair) { hashtable.add(token,pair.value+1,true); } else { hashtable.add(token,1,true); } } }); var pairs = hashtable.getKeyValuePairs(); pairs.sort(function(pair1,pair2) { return pair2.value - pair1.value; }); console.log(100 * pairs.length/tokens.length + "%"); // Percentage of non stopword nouns in corpus var wn = new natural.WordNet(); var trie = new natural.Trie(); trie.addStrings(tokens); console.log(wn.lookup('node',function(r){return r;})); var used = []; var WordNetter = function(seed,level) { this.name = seed; if(level == 0) return; this.children = []; var netter = this; wn.lookup(this.name,function (results) { if(results.length == 0) { return; } // var result = findMostSynResult(results);c results.forEach(function(result) { if(result.synonyms.length == 0) { return; } result.synonyms.forEach(function (synonym) { if (used.indexOf(synonym) == -1 && trie.contains(synonym) ) { used.push(synonym); netter.children.push( new WordNetter(synonym, level - 1) ); console.log("!"); } }); }); }); function findMostSynResult(results) { var most = results[0]; for (var i = results.length - 1; i >= 0; i--) { if(results[i].synonyms.length > most.synonyms.length) most = results[i]; }; return most; }; }; var data = { name : "root", children : new Array(10) }; for(var i = 0; i < 10; i++) { data.children[i] = new WordNetter(pairs[i].key,3); } // Really hacky way of waiting for asychronous 
calls to end...HORRIBLE practice, I know setTimeout(function(){ function removeEmpties(data) { if(data.children) { if(data.children.length == 0) { data.children = undefined; } else { for (var i = data.children.length - 1; i >= 0; i--) { removeEmpties(data.children[i]); console.log("&"); }; } } } removeEmpties(data); fs.writeFile("flare.json", JSON.stringify(data,null,'\t'), function (err) { if(err) { console.log(err); } else { console.log("flare.json was saved!"); } }); },10000); };
/**
 * Iterator for async-style collection helpers: fetches the synset that
 * `word1` refers to and passes its lemma on.
 *
 * @param {{synsetOffset: *, pos: *}} word1 - Reference handed to `wordnet.get`.
 * @param {function(null, *)} next - Receives `(null, lemma)`.
 */
var itor = function (word1, next) {
  function forwardLemma(target) {
    next(null, target.lemma);
  }

  wordnet.get(word1.synsetOffset, word1.pos, forwardLemma);
};