/** * Processes a document and returns a bag-of-words representation. * * @param {string} doc - input document * @returns {Array} sparse word vector */ toVector( doc ) { // Tokenize document after pre-processing... doc = processDocument( doc ); const words = tokenize( doc ); for ( let i = 0; i < words.length; i++ ) { words[ i ] = stemmer( words[ i ] ); } const len = words.length; // Add bigrams: for ( let i = 0; i < len - 1; i++ ) { words.push( words[ i ] + ' ' + words[ i+1 ] ); } // Create and return sparse word vector... const out = new Uint8ClampedArray( this.dim ); for ( let i = 0; i < words.length; i++ ) { const gram = words[ i ]; let idx; if ( this.hashTable.has( gram ) ) { idx = this.hashTable.get( gram ); } else { idx = this.addWord( gram ); } out[ idx ] += 1; } const vec = createVector( out ); return vec; }
/**
* Compute the lower-cased stem of a node's text.
*
* The node is serialized with `nlcstToString`, the resulting text is passed
* through `stemmer`, and the stem is lower-cased afterwards.
*
* @param {Node} node - Node to stem.
* @return {string} - Lower-cased stem of the node's text.
*/
function stemNode(node) {
  const text = nlcstToString(node);
  const stem = stemmer(text);

  return stem.toLowerCase();
}