Esempio n. 1
0
	/**
	* Processes a document and returns a bag-of-words representation.
	*
	* @param {string} doc - input document
	* @returns {Array} sparse word vector
	*/
	toVector( doc ) {
		// Tokenize document after pre-processing...
		doc = processDocument( doc );
		const words = tokenize( doc );
		for ( let i = 0; i < words.length; i++ ) {
			words[ i ] = stemmer( words[ i ] );
		}
		const len = words.length;
		// Add bigrams:
		for ( let i = 0; i < len - 1; i++ ) {
			words.push( words[ i ] + ' ' + words[ i+1 ] );
		}

		// Create and return sparse word vector...
		const out = new Uint8ClampedArray( this.dim );
		for ( let i = 0; i < words.length; i++ ) {
			const gram = words[ i ];
			let idx;
			if ( this.hashTable.has( gram ) ) {
				idx = this.hashTable.get( gram );
			}
			else {
				idx = this.addWord( gram );
			}
			out[ idx ] += 1;
		}
		const vec = createVector( out );
		return vec;
	}
Esempio n. 2
0
/**
 * Compute the lower-cased stem of a node's text.
 *
 * The node is serialized with `nlcstToString`, the resulting text is passed
 * through `stemmer`, and the stem is lower-cased before being returned.
 *
 * @param {Node} node - Node to stem.
 * @return {string} - Lower-cased stem of the node's text.
 */
function stemNode(node) {
    const text = nlcstToString(node);
    const stem = stemmer(text);

    return stem.toLowerCase();
}