Beispiel #1
0
  collect(source) {
    if (source.depth >= this.maxDepth)
      return;

    for (let link of source.links) {
      if (this.knownUrlSet.test(link.key))
        continue;

      let page = {
        pathname: link.pathname,
        penalty: source.penalty + link.penalty,
        depth: source.depth + 1
      };

      let domain = this.domainCache.get(link.host);
      if (domain) {
        let secure = link.protocol === 'https:';
        if (secure !== domain.secure)
          page.secure = secure;
      } else
        domain = this.createDomain(link);

      domain.pages.enq(page);
      this.knownUrlSet.add(link.key);
    }
  }
Beispiel #2
0
  this.check = function (text) {
    // Tag (stem + register) the text, then return the suggestion for the
    // first stemmed word that is (probably) in the bloom filter.
    // Returns false when no stemmed word matches.
    var stemmedWords = this.tag(text);

    for (var idx = 0; idx < stemmedWords.length; idx++) {
      var word = stemmedWords[idx];
      if (bloom.test(word)) {
        return getSuggested(word);
      }
    }

    return false;
  };
Beispiel #3
0
    .on('data', function (data) {
      var keyFields = data.key.split('~');
      var fieldName = keyFields[4]; //'title' or 'body' for example
      var docID = keyFields[6];
      var bloom = new bf.BloomFilter(JSON.parse(data.value.bloom), 3);
      var bloomFilterTrue = true;
      for (var i = 0; i < otherKeys.length; i++) {
//        console.log('testing bloom for ' + otherKeys[i] + ' is ' + bloom.test(otherKeys[i]));
        bloomFilterTrue = bloom.test(otherKeys[i]);
      }
      var filterSatisfied = true;
      for (var i = 0; i < filterKeySet.length; i++) {
        if (data.value.filters.indexOf(filterKeySet[i]) == -1)
          filterSatisfied = false;
      }
      if (bloomFilterTrue && filterSatisfied) {
//        docsWithBloomAndFilterTrue.push('VECTOR~' + fieldName + '~' + docID + '~');

        //possibly not wise to always return all vectors <- possible
        //performance saving for fielded queries
        docsWithBloomAndFilterTrue.push('VECTOR~*fielded~' + docID + '~');
      }
      return;
    }).on('error', function (err) {
Beispiel #4
0
var Psychonaut = function (options) {
  // Keyword tagger: stems input text with the Porter stemmer and uses a
  // bloom filter as a fast, probabilistic membership check against the
  // keys of options.content (a word -> suggestion map).
  var natural = require('natural');
  var BloomFilter = require('bloomfilter').BloomFilter;

  // Adds String.prototype.tokenizeAndStem used by tag() below.
  natural.PorterStemmer.attach();

  var bloom = new BloomFilter(
    32 * 256, // number of bits to allocate.
    16        // number of hash functions.
  );

  // BUGFIX: the old guard `(!options.content && typeof options.content !== 'object')`
  // let `content: null` through (typeof null === 'object'), which then
  // crashed later on `options.content[word]`. Reject null explicitly.
  if (!options || options.content === null
      || (!options.content && typeof options.content !== 'object')) {
    throw new Error('You need content to match keywords from');
  }

  // Suggestion lookup for a stemmed word; false when the word is unknown
  // (or maps to a falsy value).
  var getSuggested = function (word) {
    return options.content[word] || false;
  };

  // Register every stemmed word that is a known content key in the bloom
  // filter, so check() can test membership cheaply.
  var add = function (stemmed) {
    stemmed.forEach(function (word) {
      if (options.content[word]) {
        bloom.add(word);
      }
    });
  };

  // Stem `text`, record its known words in the bloom filter, and return
  // the stemmed tokens.
  this.tag = function (text) {
    var stemmed = text.tokenizeAndStem();

    add(stemmed);

    return stemmed;
  };

  // Return the suggestion for the first stemmed word of `text` that
  // (probably) appears in the bloom filter; false when nothing matches.
  this.check = function (text) {
    var stemmed = this.tag(text);

    for (var i = 0; i < stemmed.length; i ++) {
      if (bloom.test(stemmed[i])) {
        return getSuggested(stemmed[i]);
      }
    }

    return false;
  };
};
Beispiel #5
0
//TODO: clean up confusion between filters and factets
/**
 * Index one document into the levelDB-backed reverse index.
 *
 * Builds, per field: TF-IDF term frequencies, a bloom filter of
 * 'TF~term~facet~value~field' entries, REVERSEINDEX keys sorted by
 * inverted TF, per-field VECTOR entries, and DELETE-DOCUMENT bookkeeping
 * keys, then writes everything in a single batch.
 *
 * @param reverseIndex store exposing .batch(ops, cb) (levelDB-style)
 * @param docID unique id for the document
 * @param doc field -> value map; string fields are indexed, array fields
 *        listed in `facets` become facet values, everything else is dropped
 * @param facets array of field names to treat as facets
 * @param callback called with {status, tfValues} once the batch completes
 */
function indexDoc(reverseIndex, docID, doc, facets, callback) {
  //use key if found, if no key is found set filename to be key.
  var fieldBatch = [],
      id = docID,
      facetValues = {},
      fieldKey,
      highestFrequencyCount,
      k,
      deleteKeys,
      facetIndexKey,
      l,
      thisFacetValue,
      m,
      tokenKey,
      docDeleteIndexKey;

  //collect '<facet>~<value>' filter strings for every facet on this doc
  var facety = [];
  for (var i = 0; i < facets.length; i++) {
    //doc has some facet values
    if (doc[facets[i]]) {
      //loop though this facet field
      for (var j = 0; j < doc[facets[i]].length; j++) {
        facety.push(facets[i] + '~' + doc[facets[i]][j]);
      }
    }
  }

  //build the '*' composite field from every indexable string field;
  //NOTE: this loop also MUTATES `doc` (deletes null/non-string fields)
  var compositeField = '';
  for (fieldKey in doc) {
    if( Object.prototype.toString.call(doc[fieldKey]) === '[object Array]' ) {
      if (facets.indexOf(fieldKey) != -1) {
        facetValues[fieldKey] = doc[fieldKey];
      }
    }
    //throw away fields that have null value
    else if (doc[fieldKey] == null) {
      delete doc[fieldKey];
      console.log('[indexing warning] '.yellow + docID.yellow + ': '.yellow
                  + fieldKey.yellow + ' field is null, SKIPPING'.yellow)
    }
    //only index fields that are strings
    else if ((typeof doc[fieldKey]) != 'string') {
      delete doc[fieldKey];
      console.log('[indexing warning] '.yellow + docID.yellow + ': '.yellow
                  + fieldKey.yellow 
                  + ' field not string or array, SKIPPING'.yellow)
    }
    else {
      //field is OK- add it to forage.composite
      compositeField += doc[fieldKey] + ' ';
    }
  }

  doc['*'] = compositeField;
  var fieldedVector = {};

  var tfValues = [];

  for (fieldKey in doc) {
    var reverseIndexValue = {};
    reverseIndexValue['filters'] = facety;
    var tfmap = {};
    //BUGFIX: was `tfidf = new TfIdf()` (no `var`), leaking an implicit
    //global and making concurrent indexDoc calls clobber each other
    var tfidf = new TfIdf();
    tfidf.addDocument(doc[fieldKey], fieldKey + '~' + id);
    var docVector = tfidf.documents[tfidf.documents.length - 1];
    highestFrequencyCount = 0;
    for (k in docVector) {
      if (docVector[k] > highestFrequencyCount) highestFrequencyCount = docVector[k];
      if (fieldKey == '*') tfValues.push(k);
    }
    deleteKeys = [];

    //generate bloom filter, sized for the desired error margin
    var p = 0.1; //bloomErrorMargin
    var n = (Object.keys(docVector).length) //numberOfItemsInBloomFilter
    var bloomBits = Math.ceil((n * Math.log(p)) / Math.log(1.0 / (Math.pow(2.0, Math.log(2.0)))));
    var bloomHashFunctions = Math.round(Math.log(2.0) * bloomBits / n);
    var bloom = new bf.BloomFilter(bloomBits, bloomHashFunctions);
    //work out facet keys for bloom filter
    for (var i = 0; i < facets.length; i++) {
      //doc has some facet values
      if (doc[facets[i]]) {
        //loop though this facet field
        for (var j = 0; j < doc[facets[i]].length; j++) {
          for (var k in docVector) {
            if (k != '__key'){
              var bloomEntry = 'TF~' + k + '~' + facets[i] + '~'
                + doc[facets[i]][j] + '~' + fieldKey;
              bloom.add(bloomEntry);
            }
          }
        }
      }
    }
    //no facets: also add an unfaceted entry for every term
    for (var k in docVector) {
      if (k != '__key') {
        var bloomEntry = 'TF~' + k + '~~~' + fieldKey;
        bloom.add(bloomEntry);
      }
    }
    var bloomArray = [].slice.call(bloom.buckets)
    reverseIndexValue['bloom'] = JSON.stringify(bloomArray);

    //wildcard token
    docVector['*'] = 1;

    //minus 2: presumably excludes '__key' and the '*' wildcard — verify
    var docVecLength = Object.keys(docVector).length - 2;
    for (k in docVector) {
      if (k != '__key') {
        //no faceting
        facetIndexKey = ['~'];
        for (l = 0; l < facets.length; l++) {
          if (doc[facets[l]]) {
            thisFacetValue = doc[facets[l]];
            for (m = 0; m < thisFacetValue.length; m++) {
              facetIndexKey.push(facets[l] + '~' + thisFacetValue[m]);
            }
          } 
        }
        for (l = 0; l < facetIndexKey.length; l++) {
          //augmented term frequency
          var tf = (docVector[k] / highestFrequencyCount / docVecLength).toFixed(10);
          //since levelDB sorts fastest from low to high, it is best
          //to give significant keys (those with highest tf) a lower
          //score. An inverted TF is therefore used for sorting.
          var sortKey = (1 - tf).toFixed(10);
          //design key
          var tokenKey = 'REVERSEINDEX~'
            + k + '~'
            + facetIndexKey[l] + '~'
            + fieldKey + '~'
            + sortKey + '~'
            + id;
          //make a tf vector for the whole document
          tfmap[k] = tf;
          fieldBatch.push({
            type: 'put',
            key: tokenKey,
            value: reverseIndexValue});
          deleteKeys.push(tokenKey);
        }
      }
    }
    //dump references so that docs can be deleted
    docDeleteIndexKey = 'DELETE-DOCUMENT~' + id + '~' + fieldKey;
    deleteKeys.push(docDeleteIndexKey);
    fieldBatch.push({
      type: 'put',
      key: docDeleteIndexKey,
      value: deleteKeys});
    var vectorValue = {};
    vectorValue['vector'] = tfmap;
    vectorValue['facetValues'] = facetValues;
    fieldBatch.push({
      type: 'put',
      key: 'VECTOR~' + fieldKey + '~' + docID + '~',
      value: vectorValue});
    fieldedVector[fieldKey] = vectorValue;
  }
  //generate fielded document vector for weighting

  fieldBatch.push({
    type: 'put',
    key: 'VECTOR~*fielded~' + docID + '~',
    value: fieldedVector});
  //document
  fieldBatch.push({
    type: 'put',
    key: 'DOCUMENT~' + docID + '~',
    value: JSON.stringify(doc)});

  //put key-values into database
  reverseIndex.batch(fieldBatch, function (err) {
    //BUGFIX: log the error before notifying the caller (previously the
    //log sat after callback() behind a `return`, but callback was always
    //invoked first regardless of err — keep that contract)
    if (err) console.log('Ooops!', err);
    var msg = {};
    msg['status'] = '[indexed] ' + docID;
    msg['tfValues'] = tfValues;
    callback(msg);
    return;
  });
}
Beispiel #6
0
// Bail out with a failure exit code when no partition count is available.
if (!partitions) {
  process.exit(1);
}

// Read in. One host per line; note a trailing newline leaves an empty
// final entry in `hosts`.
var hosts = fs.readFileSync(source_file).toString().split("\n");

// Allocate n buckets.
var buckets = [];
for (var i = 0; i < partitions; i++) {
  buckets[i] = [];
}

// Create bloom filter for filtering at /24's.
// 256^3 bits, 8 hash functions. Membership is probabilistic: a false
// positive can silently drop a host whose prefix was never actually seen.
var BloomFilter = require('bloomfilter').BloomFilter;
var bloom = new BloomFilter(256 * 256 * 256, 8);
var adds = 0;

// Shuffle into buckets.
// Keep only the first host seen per dotted prefix (everything before the
// last '.', i.e. the /24 for IPv4 addresses — presumably; confirm input
// format), assigning each kept host to a uniformly random bucket.
for (var i = 0; i < hosts.length; i++) {
  var prefix = hosts[i].substr(0, hosts[i].lastIndexOf('.'));
  if (!bloom.test(prefix)) {
    buckets[Math.floor(Math.random()*partitions)].push(hosts[i]);
    bloom.add(prefix);
    adds += 1;
  }
}

// Mkdir if not there
if (!fs.existsSync(dest_dir)) {
  fs.mkdirSync(dest_dir);
Beispiel #7
0
 // Register each stemmed token in the bloom filter, but only when it is a
 // (truthy) key of the content map — so the filter only "knows" words we
 // can actually suggest for.
 stemmed.forEach(function (stem) {
   if (options.content[stem]) {
     bloom.add(stem);
   }
 });
Beispiel #8
0
 markAsKnown(url) {
   let key = utils.normalizeUrl(url);
   this.knownUrlSet.add(key);
 }