Exemplo n.º 1
0
/**
 * Configures the shared crawler for the given site and announces readiness.
 *
 * Collaborators from the enclosing module scope: url, Crawler, config,
 * _self, _urlQueue, CloudCrawlerState. Note that `_crawler` is assigned
 * without `var`, preserving the original's outer-scope binding.
 *
 * NOTE(review): `_crawler.start()` is never called here — presumably the
 * caller starts the crawl after the `ready` event; confirm against callers.
 *
 * @param {string} siteName - URL of the site to crawl
 * @param {Function} callback - invoked with no arguments when the crawl completes
 */
var crawlSite = function(siteName, callback){
  var parsedSite = url.parse(siteName);

  // Named handlers keep the wiring section below easy to scan.
  var handleCrawlStart = function(){
    console.log("crawlstart");
    _self.emit(CloudCrawlerState.running);
    _urlQueue.writeStart();
  };

  var handleQueueAdd = function(queueItem){
    console.log("queueadd");
    _self.emit(CloudCrawlerState.urlProcessed, queueItem.url);
    _urlQueue.writeUrl(queueItem.url);
  };

  var handleComplete = function(){
    console.log("complete");
    _urlQueue.writeStop();
    return callback();
  };

  _crawler = new Crawler();
  _crawler.host = parsedSite.hostname;
  _crawler.interval = config.crawler.interval;
  _crawler.userAgent = config.crawler.userAgent;
  _crawler.on("crawlstart", handleCrawlStart);
  _crawler.on("queueadd", handleQueueAdd);
  _crawler.on("complete", handleComplete);

  _self.emit(CloudCrawlerState.ready);
};
Exemplo n.º 2
0
  /**
   * Crawls the named store's site, streaming every discovered URL into the
   * shared urlQueue until the crawl completes, then invokes the callback.
   *
   * Collaborators from the enclosing module scope: stores, Crawler, config,
   * urlQueue. Note that `crawler` is assigned without `var`, preserving the
   * original's outer-scope binding.
   *
   * @param {string} storeName - key understood by stores.getStoreFromName
   * @param {Function} callback - invoked with no arguments when the crawl completes
   */
  var crawlStore = function(storeName, callback){
    var targetStore = stores.getStoreFromName(storeName);

    // Named handlers keep the wiring section below easy to scan.
    var handleCrawlStart = function(){
      urlQueue.writeStart();
    };

    var handleQueueAdd = function(queueItem){
      console.log("Added: " + queueItem.url);
      urlQueue.writeUrl(queueItem.url);
    };

    var handleComplete = function(){
      urlQueue.writeStop();
      callback();
    };

    crawler = new Crawler();
    crawler.host = targetStore.host;
    crawler.interval = config.crawler.interval;
    crawler.userAgent = config.crawler.userAgent;
    crawler.addFetchCondition(targetStore.fetchConditions);
    crawler.on("crawlstart", handleCrawlStart);
    crawler.on("queueadd", handleQueueAdd);
    crawler.on("complete", handleComplete);

    crawler.start();
  };
Exemplo n.º 3
0
// Crawl www.npr.org and record every successfully fetched URL in urls.txt.
var Crawler = require("simplecrawler").Crawler
  , fs = require('fs')
  , crawl = new Crawler('www.npr.org');

crawl.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    console.log("I just received %s (%d bytes)", queueItem.url, responseBuffer.length);
    // console.log("It was a resource of type %s", response.headers['content-type']);

    // fs.appendFile requires a completion callback on modern Node (omitting
    // it throws TypeError [ERR_INVALID_CALLBACK]); log failures rather than
    // letting a write error crash the crawl.
    fs.appendFile('urls.txt', queueItem.url + '\n', function(err) {
        if (err) {
            console.error("Failed to record %s: %s", queueItem.url, err.message);
        }
    });
});
crawl.start();
Exemplo n.º 4
0
/**
 * Crawls `domain` and saves every fetched HTML page into the `db.sites`
 * collection, then invokes `callback` once the crawl completes.
 *
 * Collaborators from the enclosing module scope: Crawler, iconv, cheerio,
 * db, charset, site. NOTE(review): `charset` and `site` are not defined in
 * this function or visible in this file — confirm they exist at module
 * level, otherwise the fetchcomplete handler throws a ReferenceError.
 *
 * @param {string} domain - site to crawl (passed straight to Crawler)
 * @param {Function} callback - invoked with no arguments when the crawl completes
 */
var downloadSite = function (domain, callback) {

    var myCrawler = new Crawler(domain);
    myCrawler.interval = 250;      // ms between requests
    myCrawler.maxConcurrency = 5;

    myCrawler.on("fetchcomplete", function (queueItem, responseBuffer, response) {
        // Only persist HTML pages. A single '.htm' substring test covers
        // both extensions, since every '.html' URL also contains '.htm'.
        if (queueItem.url.indexOf('.htm') === -1) {
            return;
        }

        // Decode the body: legacy gb2312 sites need an explicit transcode;
        // everything else is treated as UTF-8 (Buffer.toString default).
        var html = charset === 'gb2312'
            ? iconv.decode(responseBuffer, 'gb2312')
            : responseBuffer.toString();

        console.log(queueItem.url);
        console.log(html);

        var $ = cheerio.load(html);
        var title = $('title').text();
        console.log('title:' + title);

        db.sites.save({
            url: queueItem.url,
            content: html,
            title: title,
            category: site
        });
    });

    // Signal the caller once the whole crawl is done.
    myCrawler.on('complete', function () {
        callback();
    });

    myCrawler.start();
};
Exemplo n.º 5
0
  trap: function (ghostDomain, ghostPort, staticDirectory, staticWebAddress, callback, debug) {
    // check function params are valid
    if (typeof ghostDomain !== 'string') {
      callback(new TypeError('`ghostDomain` must be a string'));
      return;
    }
    if (typeof staticDirectory !== 'string') {
      callback(new TypeError('`staticDirectory` must be a string'));
      return;
    }
    if (typeof staticWebAddress !== 'string') {
      callback(new TypeError('`staticWebAddress` must be a string'));
      return;
    }


    // configure the crawler
    var crawler = new Crawler(ghostDomain);
    crawler.initialPort = ghostPort;
    crawler.stripQuerystring = true;

    // Make sure the user knows that we're doing something
    var spinner = new Spinner('"Don\'t look directly at the trap!" Collecting Ghost now… %s');
    spinner.setSpinnerString('|/-\\');

    // Set to crawl entire domain
    crawler.queue.add('http', ghostDomain, ghostPort, '/');
    // force crawl robots.txt
    crawler.queue.add('http', ghostDomain, ghostPort, '/robots.txt');


    // When a file is received, save file in the appropriate place
    crawler.on('fetchcomplete', function (queueItem, responseBuffer, response) {
      // Parse url
      var parsed = url.parse(queueItem.url);

      // Rename / to index.html
      if (parsed.pathname.slice(-1) === '/') {
        parsed.pathname += 'index.html';
      }

      // Get directory name in order to create any nested dirs
      var dirname = staticDirectory + parsed.pathname.replace(/\/[^\/]+$/, '');

      // Path to save file
      var filepath = staticDirectory + parsed.pathname;

      // Check if DIR exists
      fs.exists(dirname, function (exists) {

        // If DIR exists, write file
        if (exists) {
          fs.writeFile(filepath, responseBuffer, function () {
            debug(sprintf('writeFile %s', filepath));
          });

        // Else, recursively create dir using node-fs, then write file
        } else {
          fs.mkdir(dirname, '0755', true, function () {
            debug(sprintf('mkdir %s', dirname));

            fs.writeFile(filepath, responseBuffer, function () {
              debug(sprintf('writeFile %s', filepath));
            });
          });
        }
      });

      debug(sprintf('fetchcomplete %s, %d bytes, %s, %s', parsed.pathname, responseBuffer.length, filepath, response.headers['content-type']));
    });


    // error out if the local ghostDomain server isn't running
    crawler.on('fetchclienterror', function () {
      // stop crawling
      this.stop();
      spinner.stop();

      callback(new Error('A Ghost instance at http://' + ghostDomain + ' is not currently running'));
    });


    // return success message on complete
    crawler.on('complete', function () {
      // Replace the old domain with the new one
      strReplace({
        regex: ghostDomain + ':' + ghostPort,
        replacement: staticWebAddress,
        paths: [staticDirectory],
        recursive: true,
        silent: true
      });

      spinner.stop();
      callback('success');
    });

    // start crawling
    crawler.start();
    spinner.start();
  },