Exemplo n.º 1
0
// GET / : crawls the configured start URL and replies with the fetched page
// bodies as JSON once the crawl has finished.
// `Crawler`, `initialUrl`, `initialPath` and `pagesArr` are defined outside
// this snippet (presumably module-level — confirm against the full file).
router.get('/', function(req, res) {

    var myCrawler = new Crawler(initialUrl, initialPath);

    myCrawler.downloadUnsupported = false;
    myCrawler.fetchWhitelistedMimeTypesBelowMaxDepth = false; // author's note: prevents resource downloads
    myCrawler.filterByDomain = true;
    myCrawler.interval = 10000; // Ten seconds
    myCrawler.maxConcurrency = 1;
    myCrawler.maxDepth = 1;
    myCrawler.parseHTMLComments = false;
    myCrawler.parseScriptTags = false;

    // myCrawler.addFetchCondition(function(parsedURL, queueItem) {
    //     return !parsedURL.path.match(/\.pdf$/i);
    // });

    myCrawler.on('fetchcomplete', function(queueItem, responseBuffer, response) {
        console.log('I just received %s (%d bytes)', queueItem.url, responseBuffer.length);
        console.log('It was a resource of type %s', response.headers['content-type']);
        // Keep the raw body so it can be returned when the crawl completes.
        pagesArr.push(responseBuffer);
    });

    // Answer the HTTP request when the crawl is done. Previously this handler
    // was commented out (and missing a comma), so the request never received
    // a response and hung until the client gave up.
    myCrawler.on('complete', function() {
        res.json(pagesArr);
    });

    myCrawler.start();

});
// Queue the profile page of each team in the fixed id list, then start the crawl.
connection.query("SELECT profile_uri FROM `transfermarkt_team` WHERE profile_uri IS NOT NULL AND id IN ('2779','44247','26099','11952','3561')", function(err, rows) {
    if (err) {
        throw err;
    }
    // Walk the result set from the last row back to the first.
    var index = rows.length;
    while (index-- > 0) {
        crawler.queueURL(host + rows[index].profile_uri);
    }
    crawler.start();
});
Exemplo n.º 3
0
// Queue the profile page of every team whose owner_id is 0, then start the crawl.
connection.query("SELECT profile_uri FROM `transfermarkt_team` WHERE profile_uri IS NOT NULL AND owner_id = 0", function(err, rows) {
    if (err) {
        throw err;
    }
    // Rows are queued in reverse result order (last row first).
    rows.slice().reverse().forEach(function(row) {
        crawler.queueURL(host + row.profile_uri);
    });
    crawler.start();
});
Exemplo n.º 4
0
// Functions
/**
 * Configures the shared `myCrawler` instance from `options` and starts it.
 *
 * @param {Object} options
 * @param {string} options.server      Value assigned to `myCrawler.host`.
 * @param {number} options.interval    Value assigned to `myCrawler.interval`.
 * @param {number} options.concurrency Value assigned to `myCrawler.maxConcurrency`.
 * @param {number} options.port        Value assigned to `myCrawler.initialPort`.
 * @param {string} options.path        Value assigned to `myCrawler.initialPath`.
 * @param {string} options.protocol    Value assigned to `myCrawler.initialProtocol`.
 */
function initializeCrawler(options) {
    // Copy the caller's settings onto the crawler
    myCrawler.host = options.server;
    myCrawler.interval = options.interval;
    myCrawler.maxConcurrency = options.concurrency;
    myCrawler.initialPort = options.port;
    myCrawler.initialPath = options.path;
    myCrawler.initialProtocol = options.protocol;

    // File extensions of resources that we don't care about
    var ignoredExtensions = [
        'css', 'js',
        'jpg', 'jpeg', 'png', 'gif', 'bmp',
        'pdf', 'doc', 'docx', 'ppt', 'pptx', 'zip'
    ];

    myCrawler.addFetchCondition(function(parsedURL) {
        var url = parsedURL.uriPath.toLowerCase();

        // Match on ".ext" rather than the bare letters, so that pages such as
        // "/docs/nodejs" or "/downloads/zip" are not mistaken for files.
        return !ignoredExtensions.some(function(extension) {
            var suffix = '.' + extension;
            return url.indexOf(suffix, url.length - suffix.length) !== -1;
        });
    });

    myCrawler.start();
}
// Queue the profile page of every player whose stored height is below 100,
// then start the crawl.
connection.query("SELECT profile_uri FROM `transfermarkt_player` WHERE height < 100 AND profile_uri IS NOT NULL ORDER BY id DESC", function(err, rows) {
    if (err) {
        throw err;
    }
    // Iterate from the last row to the first.
    for (var remaining = rows.length; remaining > 0; remaining--) {
        crawler.queueURL(host + rows[remaining - 1].profile_uri);
    }
    crawler.start();
});
	// Queue the profile page of every team referenced by transfermarket_team_player,
	// release the connection, then start the crawl.
	connection.query("SELECT profile_uri FROM `transfermarkt_team` WHERE profile_uri IS NOT NULL AND id IN (SELECT DISTINCT team_id FROM transfermarket_team_player)", function(err, rows) {
	    if (err) {
	        throw err;
	    }
	    // Rows are queued last-to-first.
	    var index = rows.length;
	    while (index-- > 0) {
	        crawler.queueURL(host + rows[index].profile_uri);
	    }
	    connection.release();
	    crawler.start();
	});
	// Queue the profile page of every player whose date_of_birth is the
	// placeholder '0000-00-00', release the connection, then start the crawl.
	connection.query("SELECT profile_uri FROM `transfermarkt_player` WHERE date_of_birth = '0000-00-00' AND profile_uri IS NOT NULL ORDER BY id ASC", function(err, rows) {
	    if (err) {
	        throw err;
	    }
	    // Rows are queued in reverse result order.
	    rows.slice().reverse().forEach(function(row) {
	        crawler.queueURL(host + row.profile_uri);
	    });
	    connection.release();
	    crawler.start();
	});
	// Queue a club page for every releasing team id that appears in transfers
	// but has no row in transfermarket_team, then start the crawl.
	connection.query("SELECT DISTINCT releasing_team_id FROM transfermarket_transfer WHERE releasing_team_id NOT IN (SELECT id FROM `transfermarket_team`)", function(err, rows) {
	    if (err) {
	        throw err;
	    }
	    connection.release();
	    // Every id is appended to the same fixed path prefix; rows are walked last-to-first.
	    var clubPathPrefix = host + '/arsenal-fc/startseite/verein/';
	    for (var remaining = rows.length; remaining > 0; remaining--) {
	        crawler.queueURL(clubPathPrefix + rows[remaining - 1].releasing_team_id);
	    }
	    crawler.start();
	});
	// Queue every team profile page with any trailing "/saison_id/YYYY"
	// segment removed, then start the crawl.
	connection.query("SELECT profile_uri FROM transfermarket_team", function(err, rows) {
	    if (err) {
	        throw err;
	    }
	    connection.release();
	    // Rows are queued in reverse result order.
	    rows.slice().reverse().forEach(function(row) {
	        // Keep only the ".../startseite/verein/<id>" part of the URI.
	        var seasonlessPath = row.profile_uri.replace(/(^\/\S+?\/startseite\/verein\/\d+?)(\/saison_id\/\d{4})?$/,'$1');
	        crawler.queueURL(host + seasonlessPath);
	    });
	    crawler.start();
	});
Exemplo n.º 10
0
/**
 * Starts the shared crawler and hands every newly queued URL that does not
 * match `phantomBannedExtensions` over to the phantom processing queue.
 *
 * @param {*} phantom Passed through unchanged to `processQueue`.
 */
function runCrawler(phantom) {
  crawler.start();
  crawler.on("queueadd", function onQueueAdd(queueItem) {
    var queuedUrl = queueItem.url;
    if (queuedUrl.match(phantomBannedExtensions)) {
      return;
    }
    // `this.wait()` yields the callback that is handed to processQueue.
    var resume = this.wait();
    phantomQueue.push(queuedUrl);
    processQueue(phantom, resume);
  });
}
	// Queue the full fixture list page ("gesamtspielplan") of every matched
	// competition, then start the crawl.
	connection.query("SELECT transfermarket_competition.uri FROM `competition` JOIN `nation` ON competition.nation_id = nation.id JOIN `transfermarket_nation` ON nation.full_name = transfermarket_nation.name JOIN `transfermarket_competition` ON transfermarket_competition.nation_id = transfermarket_nation.id WHERE transfermarket_competition.competition_name IN (SELECT name FROM `competition`)", function(err, rows) {
	    if (err) {
	        throw err;
	    }
	    connection.release();
	    // Rows are queued in reverse result order.
	    rows.slice().reverse().forEach(function(row) {
	        // Swap the overview page segment for the fixture list segment.
	        var fixturesPath = row.uri.replace('startseite','gesamtspielplan');
	        crawler.queueURL(host + fixturesPath);
	    });
	    crawler.start();
	});
Exemplo n.º 12
0
      return new Promise((resolve) => {
        const redirectedUrls = new Set(),
          url = urlParser.parse(message.url),
          crawler = new Crawler(url.hostname, url.path, url.port);

        let pageCount = 1; // First page is start url

        if (url.protocol) {
          // Node's url parser includes a : at the end of protocol, simplecrawler expects no :.
          crawler.initialProtocol = url.protocol.slice(0, -1);
        }

        crawler.maxDepth = this.options.depth;
        crawler.downloadUnsupported = false;
        crawler.allowInitialDomainChange = true;
        crawler.parseHTMLComments = false;
        crawler.addFetchCondition(function(parsedURL) {
          const extension = path.extname(parsedURL.path);
          // Don't try to download these, based on file name.
          return ['png', 'jpg', 'gif', 'pdf'].indexOf(extension) === -1;
        });

        crawler.on('fetchredirect', (queueItem, parsedURL, response) => {
          redirectedUrls.add(response.headers.location);
        });

        crawler.on('fetchcomplete', (queueItem) => {
          const pageMimeType = /^(text|application)\/x?html/i;

          const url = queueItem.url;
          if (redirectedUrls.has(url)) {
            log.verbose('Crawler skipping redirected URL %s', url);
          } else if (message.url === url) {
            log.verbose('Crawler skipping initial URL %s', url);
          } else if (pageMimeType.test(queueItem.stateData.contentType)) {
            log.verbose('Crawler found URL %s', url);
            queue.postMessage(make('url', {}, {url}));
            pageCount++;

            if (pageCount >= maxPages) {
              log.verbose('Crawler stopped after %d urls', pageCount);
              crawler.stop();
              return resolve();
            }
          } else {
            log.verbose('Crawler found non html URL %s', url);
          }
        });

        crawler.on('complete', resolve);

        log.debug('Starting to crawl from ' + message.url + ' with max depth ' + crawler.depth +
          ' and max count ' + maxPages);
        crawler.start();
      })
Exemplo n.º 13
0
/**
 * Spiders the local server started for crawling and reports every resource
 * whose queue status ended up as 'notfound'. Exits the process with code 1
 * when at least one such resource exists.
 */
function crawl() {
  var crawler = new Crawler("localhost", "/", server.address().port);

  crawler.interval = 0;
  crawler.parseScriptTags = false;

  // Extend the stock resource discovery with the project's own URL finders.
  crawler.discoverResources = function(resourceData, queueItem) {
    var found = Crawler.prototype.discoverResources.call(this, resourceData, queueItem);
    var text = resourceData.toString("utf8");

    URL_FINDERS.forEach(function(findURLs) {
      Array.prototype.push.apply(found, findURLs(text, queueItem.url));
    });

    // Absolute URLs based on config.ORIGIN would never be spidered, because
    // the crawl server listens on a dynamic localhost origin. Rebase them
    // onto the URL of the page that referenced them.
    return found.map(function(url) {
      var isOnConfiguredOrigin = url.indexOf(config.ORIGIN) === 0;
      return isOnConfiguredOrigin
        ? urlResolve(queueItem.url, url.replace(config.ORIGIN, ''))
        : url;
    });
  };

  crawler.on('complete', function() {
    var missing = crawler.queue.filter(function(item) {
      return item.status === 'notfound';
    });

    missing.forEach(function(item) {
      console.log("Couldn't find " + chalk.bold.red(item.path) + " referenced by " + urlParse(item.referrer).path + ".");
    });

    server.close();

    if (missing.length === 0) {
      console.log("Fetched " + chalk.bold.green(crawler.queue.length.toString()) + " URLs without encountering any 404s.");
      return;
    }

    console.log("Alas, some files could not be found.");
    process.exit(1);
  });

  crawler.start();
}
			// Queue the correction page ("korrektur") of every returned profile,
			// release the connection, start the crawl and reset players_id.
			connection.query(sql, function(err, rows) {
			    if (err) {
			        throw err;
			    }
			    // Rows are queued in reverse result order.
			    rows.slice().reverse().forEach(function(row) {
			        var correctionPath = row.profile_uri.replace('profil','korrektur');
			        crawler.queueURL(host + correctionPath);
			    });
			    connection.release();
			    crawler.start();
			    players_id = [];
			});
// Queue the full fixture list page ("gesamtspielplan") of every competition
// that has a reference id, logging each URL; the crawl only starts when the
// query returned at least one row.
excute("SELECT uri FROM `transfermarket_competition` WHERE competition_ref_id != 0").then(function(rows) {
	if (!rows.length) {
		return;
	}
	for (var index = 0; index < rows.length; index++) {
		var target = host + rows[index].uri.replace('startseite','gesamtspielplan');
		console.log(target);
		crawler.queueURL(target);
	}
	crawler.start();
});
Exemplo n.º 16
0
 // For every bill link that points at a "topical_nature/<digits>" page, queue
 // that page on the scraper; the scraper is started on the first match only.
 $('#leftcontent1 div.markframe table.billsresult a').each(function(idx) {
     var href = $(this).attr('href');
     if (!href.match(/topical_nature\/\d+/)) {
         return;
     }

     // The last path segment of the link is appended to the fixed prefix.
     var segments = href.split('/');
     var lastSegment = segments[segments.length - 1];
     scraper.queue.add('http', 'www.parliament.bg', 80, '/bg/topical_nature/' + lastSegment);

     if (scraper_started) {
         return;
     }
     scraper_started = true;
     scraper.start();
 });
Exemplo n.º 17
0
  exports.getSitemap = function(req, res) {
    // var testCrawler = crawler.getLinksFromUrl();
    var Crawler = require('simplecrawler');
    var myCrawler = new Crawler('www.zentorrents.com');
    myCrawler.initialProtocol = 'http';
    myCrawler.initialPort = 80;
    myCrawler.maxConcurrency = 1;
    myCrawler.interval = 120;
    myCrawler.timeout = 1000;
    myCrawler.userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36';
    myCrawler.filterByDomain = true;
    myCrawler.initialPath = '/series';
    // var conditionID = myCrawler.addFetchCondition(function(parsedURL, queueItem) {
    //   if (parsedURL.path.match('publicidad-sem')) {
    //     return true;
    //   } else {
    //     return false;
    //   }
    // });
    var urls = [];
    var i = 0;
    myCrawler.on("fetchcomplete", function(link) {
      if (link.url.match('series/')) {
        urls.push(link.url);
        i++;
        console.log('URL:', i, link.url);
      }

      // res.send(link);
    });

    myCrawler.on('queueerror', function(err) {
      console.log('error', err);
    });

    myCrawler.on('complete', function() {
      res.send(urls);
    });
    myCrawler.start();
  };
});

/**
 * Extracts a recipe record from a fetched page.
 *
 * @param {Object} queueItem      Crawler queue item; only `url` is read.
 * @param {*}      responseBuffer Page body, handed to `cheerio.load`.
 * @returns {Object} Record with `url`, `name`, `image`, `ingredients`,
 *   `instructions`, `yield` and `preptime` taken from the page markup.
 */
function parseResponse(queueItem, responseBuffer) {
  const $ = cheerio.load(responseBuffer);

  // Collects the text of every element matched by `selector`, in document order.
  const textsOf = (selector) => {
    const texts = [];
    $(selector).each((index, node) => {
      texts.push($(node).text());
    });
    return texts;
  };

  const url = queueItem.url;
  const name = $('.recipe-title h1').text().trim();
  const image = $('.photo.pic.u-photo').attr('src');
  const ingredients = textsOf('.p-ingredient');
  const instructions = textsOf('.instructions.e-instructions li');

  return {
    url,
    name,
    image,
    ingredients,
    instructions,
    yield: $('.p-yield.num yield').attr('value'),
    preptime: $('.dt-duration').attr('datetime'),
  };
}

// Parse and print a recipe for every fetched URL accepted by `re`; all other
// pages are ignored.
myCrawler.on('fetchcomplete', (queueItem, responseBuffer, response) => {
  if (re.test(queueItem.url)) {
    console.log(parseResponse(queueItem, responseBuffer));
  }
});

myCrawler.start();
Exemplo n.º 19
0
// Pick one random URI from error_uri and start the crawl from it.
connection.query('SELECT uri FROM error_uri ORDER BY RAND() LIMIT 1', function(err,rows) {
    if (err) throw err;
    // An empty table yields no row; previously rows[0].uri threw a TypeError here.
    if (!rows || rows.length === 0) {
        console.warn('error_uri returned no rows; crawl not started');
        return;
    }
    crawler.initialPath = rows[0].uri;
    crawler.start();
});
Exemplo n.º 20
0
      return new Promise(resolve => {
        const redirectedUrls = new Set(),
          crawler = new Crawler(message.url);

        let pageCount = 1; // First page is start url

        crawler.maxDepth = this.options.depth;
        crawler.downloadUnsupported = false;
        crawler.allowInitialDomainChange = true;
        crawler.parseHTMLComments = false;
        crawler.addFetchCondition(function(parsedURL) {
          const extension = path.extname(parsedURL.path);
          // Don't try to download these, based on file name.
          return ['png', 'jpg', 'gif', 'pdf'].indexOf(extension) === -1;
        });

        if (this.basicAuth) {
          const userAndPassword = this.basicAuth.split('@');
          crawler.needsAuth = true;
          crawler.authUser = userAndPassword[0];
          crawler.authPass = userAndPassword[1];
        }

        crawler.on('fetchredirect', (queueItem, parsedURL, response) => {
          redirectedUrls.add(response.headers.location);
        });

        crawler.on('fetchcomplete', queueItem => {
          const pageMimeType = /^(text|application)\/x?html/i;

          const url = queueItem.url;
          if (redirectedUrls.has(url)) {
            log.verbose('Crawler skipping redirected URL %s', url);
          } else if (message.url === url) {
            log.verbose('Crawler skipping initial URL %s', url);
          } else if (pageMimeType.test(queueItem.stateData.contentType)) {
            log.verbose('Crawler found %s URL %s', pageCount, url);
            queue.postMessage(make('url', {}, { url, group: message.group }));
            pageCount++;

            if (pageCount >= maxPages) {
              log.info('Crawler stopped after %d urls', pageCount);
              crawler.stop();
              return resolve();
            }
          } else {
            log.verbose('Crawler found non html URL %s', url);
          }
        });

        crawler.on('complete', resolve);

        log.info(
          'Starting to crawl from ' +
            message.url +
            ' with max depth ' +
            crawler.maxDepth +
            ' and max count ' +
            maxPages
        );
        crawler.start();
      });
Exemplo n.º 21
0
  // Grunt multi-task "link-checker": crawls `this.data.site`, logs every
  // broken link (404, fetch error, timeout, client error, missing fragment
  // target) and finishes the task with failure when any problem was found.
  // Every key of the task options is copied onto the crawler instance.
  grunt.registerMultiTask('link-checker', 'Checks your site for broken links after a build.', function () {

    var done = this.async();
    var options = this.options();
    var errors = false;
    var site = this.data.site;

    // Coloured referrer for log output. The start URL has no referrer, so
    // fall back to the site name instead of dereferencing undefined.
    var linkedFrom = function (queueItem) {
      return queueItem.referrer ? queueItem.referrer.cyan : site;
    };

    grunt.log.ok('Checking for broken links at: ' + site + (options.initialPort ? ':' + options.initialPort : ''));
    var crawler = new Crawler(site);

    Object.keys(options).forEach(function(key) {
      crawler[key] = options[key];
    });
    crawler
      .on('fetch404',function(queueItem, response) {
        errors = true;
        grunt.log.error('Resource not found linked from ' + linkedFrom(queueItem) + ' to', queueItem.url.magenta);
        grunt.log.error('Status code: ' + response.statusCode);
      })
      .on('fetcherror', function(queueItem, response) {
        errors = true;
        grunt.log.error('Trouble fetching the following resource linked from ' + linkedFrom(queueItem) + ' to', queueItem.url.magenta);
        grunt.log.error('Status code: ' + response.statusCode);
      })
      .on('fetchtimeout', function(queueItem) {
        errors = true;
        grunt.log.error('Timeout fetching the following resource linked from ' + linkedFrom(queueItem) + ' to', queueItem.url.magenta);
      })
      .on('fetchclienterror', function(queueItem) {
        errors = true;
        if (!queueItem.referrer) {
          return grunt.log.error('Error fetching `site` URL: ' + queueItem.url.magenta);
        }
        // The old expression `'…' + referrer ? a : b` evaluated the ternary
        // on the whole concatenated string, so the message text was dropped.
        grunt.log.error('Client error fetching the following resource linked from ' + linkedFrom(queueItem) + ' to', queueItem.url.magenta);
      })
      .on('complete', function() {
        if (!errors) {
          grunt.log.ok('No broken links found at: ' + site + (options.initialPort ? ':' + options.initialPort : ''));
        }
        done(!errors);
      })
      .on('fetchcomplete', function(queueItem, responseBuffer) {
        grunt.log.debug('Fetched: ' + queueItem.url);
        if (options.noFragment) {
          return;
        }
        var html = responseBuffer.toString();
        var $ = cheerio.load(html);

        // Queue every link carrying a fragment so its target can be verified.
        $('a[href*="#"]').each(function(i, anchor) {
          crawler.queueURL($(anchor).attr('href'), queueItem);
        });

        if (queueItem.url.indexOf('#') !== -1) {
          try {
            // The fragment ("#id") doubles as a selector for the target element.
            if ($(queueItem.url.slice(queueItem.url.indexOf('#'))).length === 0) {
              grunt.log.error('Error finding content with the following fragment identifier linked from ' + linkedFrom(queueItem)  + ' to', queueItem.url.magenta);
              errors = true;
            }
          } catch (e) {
            // Reached when the selector lookup above throws.
            grunt.log.error('The following URL was formatted incorrectly linked from ' + linkedFrom(queueItem)  + ' to', queueItem.url.magenta);
            errors = true;
          }
        }
      });
    if (options.callback) options.callback(crawler);
    crawler.start();
  });
			    connection.release();
			    crawler.start();
			    players_id = [];
			});
		});
	}
}).on('fetcherror',function(queueItem, response){
	crawler.queueURL(host + queueItem.path);
}).on('fetchtimeout',function(queueItem, response){
	crawler.queueURL(host + queueItem.path);
}).on('fetchclienterror',function(queueItem, response){
	crawler.queueURL(host + queueItem.path);
});
// Seed the crawl with the first two pages of the latest-transfers statistic.
['/statistik/letztetransfers', '/statistik/letztetransfers?page=2'].forEach(function(seedPath) {
	crawler.queueURL(host + seedPath);
});
crawler.start();
/*pool.getConnection(function(err, connection) {
	connection.query("SELECT profile_uri FROM transfermarket_player WHERE id NOT IN (SELECT DISTINCT player_id FROM `transfermarket_transfer`) ORDER BY id DESC LIMIT 100 OFFSET 0", function(err,rows) {
	    if (err) throw err;
	    for (var i = rows.length - 1; i >= 0; i--) {
		    var path = rows[i].profile_uri;
		    path = path.replace('profil','korrektur');
	    	
	    	crawler.queueURL(host + path);
	    };
	    connection.release();
	    
	    crawler.start();
	});
});
pool.getConnection(function(err, connection) {
Exemplo n.º 23
0
/**
 * Creates, registers and starts a crawler for `site`.
 *
 * Publishes a "started" message on the 'Sites' channel and a "crawled"
 * message on the 'Pages' channel for every fetched page.
 *
 * @param {Object} site Scan description; `_id` and `host` are required.
 * @throws {Error} When `site._id` or `site.host` is undefined.
 */
Mapper.prototype.newCrawler = function (site) {
  var mapper = this;

  // Config conditions
  if (site._id === undefined) {
    throw new Error("Scan ID required");
  }
  if (site.host === undefined) {
    throw new Error("target site undefined");
  }

  // Create the crawler
  var crawler = new Crawler(site.host);
  crawler.site = site; // Stash this for later
  crawler.stripQuerystring = true;
  crawler.maxConcurrency = 5;
  crawler.timeout = 30000;

  // Exclude things that we don't want, one fetch condition per file type.
  // In the future the config will drive this list.
  var excludedPatterns = [/\.js$/i, /\.css$/i, /\.png$/i, /\.jpg$/i, /\.kml$/i, /\.mp4$/i];
  excludedPatterns.forEach(function (pattern) {
    crawler.addFetchCondition(function (parsedURL) {
      return !parsedURL.path.match(pattern);
    });
  });

  crawler.on("fetchcomplete", function (queueItem, responseBuffer, response) {
    var html = responseBuffer.toString();
    var pageTitle = cheerio.load(html)('title').html();

    mapper.postal.publish({
      channel: 'Pages',
      topic: 'crawled',
      data: {
        queueItem: queueItem,
        url: queueItem.url,
        page: html,
        title: pageTitle,
        sitescan_id: crawler.site._id
      }
    });
  });

  // Status 1 is set just before the "started" notification goes out.
  crawler.site.status = 1;
  mapper.postal.publish({
    channel: 'Sites',
    topic: 'started',
    data: crawler.site
  });

  mapper.crawlers.push(crawler);
  crawler.start();

  crawler.on("complete", function () {
    logger.info("Finished crawling %s", crawler.host);
    crawler.site.status = 2; // Done; picked up by the next round of updates
  });
};
Exemplo n.º 24
0
// POST /search : crawls the host named in req.body.url and answers with the
// crawled URLs that share a prefix with the requested URL. The result is
// evaluated every 20 seconds and sent once, after which polling stops.
app.post('/search', function(req, res) {

	var url,path,pathArray;
	var matchedUrl = [];
	var requestedUrl = req.body && req.body.url;
	var supportedTlds = ['.com', '.edu', '.org'];

	if (typeof requestedUrl !== 'string') {
		return res.status(400).send('A "url" string is required');
	}

	// Split the URL into host and path at the first supported TLD found.
	// NOTE(review): slice(7, …) assumes the URL starts with "http://" (7
	// characters); "https://" URLs yield a host with a leading "/" — confirm.
	for (var t = 0; t < supportedTlds.length; t++) {
		var tldIndex = requestedUrl.indexOf(supportedTlds[t]);
		if (tldIndex > -1) {
			url = requestedUrl.slice(7, tldIndex + 4);
			path = requestedUrl.slice(tldIndex + 5);
			break;
		}
	}

	// Previously an unsupported TLD left `path` undefined and the handler
	// crashed on path.indexOf().
	if (path === undefined) {
		return res.status(400).send('Only .com, .edu and .org URLs are supported');
	}

	if(path.indexOf('/')==-1){
		pathArray = path.split('&')
	} else {
		pathArray = path.split('/')
	}

	// Drop the empty segment produced by a trailing separator.
	if(pathArray[pathArray.length-1] == ''){
		pathArray.splice(pathArray.length-1 , 1)
	}

	console.log(url)
	console.log(pathArray)

	var eventCrawler = new Crawler(url);

	// NOTE(review): only the FIRST path segment is checked, exactly as the
	// original loop did (it returned during its first iteration). Confirm
	// whether "any segment" or "all segments" was intended.
	eventCrawler.addFetchCondition(function(parsedURL){
		for(var i = 0 ; i < pathArray.length ; i ++){
			return (parsedURL.path.indexOf(pathArray[i]) > -1)
		}
	})

	eventCrawler.addFetchCondition(function(parsedURL) {
		return !parsedURL.path.match(/\.jpg$/i);
	})

	eventCrawler.addFetchCondition(function(parsedURL) {
		return !parsedURL.path.match(/\.png$/i);
	})

	eventCrawler.addFetchCondition(function(parsedURL) {
		return !parsedURL.path.match(/\.gif$/i);
	})

	eventCrawler.on("fetchcomplete" , function(queueItem){
		console.log("Completed fetching resource:",queueItem.url);
		matchedUrl.push(queueItem.url)
	})

	// URL prefixes up to the second-to-last and the last path segment.
	var popPath = pathArray[pathArray.length-2];
	var popPathTwo = pathArray[pathArray.length-1]
	var referenceCheckOne = requestedUrl.slice(0,requestedUrl.indexOf(popPath));
	var referenceCheckTwo = requestedUrl.slice(0,requestedUrl.indexOf(popPathTwo));

	var poll;

	// Stops polling and crawling, then answers the request exactly once.
	// Without clearInterval the next tick called res.send() again on an
	// already finished response.
	function finish(result){
		clearInterval(poll);
		eventCrawler.stop();
		res.send(result)
	}

	poll = setInterval(function(){

		var filtered = matchedUrl.filter(function(i){
			return (i.indexOf(referenceCheckOne) > -1)
		})

		var filteredTwo = matchedUrl.filter(function(i){
			return (i.indexOf(referenceCheckTwo) > -1)
		})

		if(filteredTwo.length == 0){
			finish(filtered)
		} else if (filteredTwo.length>=10){
			finish(filteredTwo)
		} else {
			console.log('Still checking...')
		}

	},20000)

	eventCrawler.start();

});
Exemplo n.º 25
0
(function() {
'use strict';
var options = {
    connect: function (client) {
        var cp = client.connectionParameters;
    }
};

var pgp = require('pg-promise')(options);
var Crawler = require("simplecrawler");
var cheerio = require('cheerio');
var fs = require('fs');
var date = new Date();
var start;
var devId, fileId, fileExists, devExists;

var db = pgp('postgres://*****:*****@46.229.230.245:5432/wv011401db');

// Create the crawler and queue one git.eclipse.org URL per commit path listed
// in commits_urls.txt.
var crawler = new Crawler()
crawler.interval = 250;
crawler.maxDepth = 1;
crawler.maxConcurrency = 7;

//Get commit links from file
// Blank lines (including the one produced by a trailing newline) and stray
// carriage returns are ignored. The previous code dropped the last line
// unconditionally, losing a URL when the file had no trailing newline.
var commitPaths = fs.readFileSync('commits_urls.txt')
  .toString()
  .split('\n')
  .map(function(line) {
    return line.trim();
  })
  .filter(function(line) {
    return line !== '';
  });
commitPaths.forEach(function(element) {
  crawler.queue.add('http', 'git.eclipse.org', 80, element);
});


// Report the total running time once the crawl has finished.
crawler.on('complete', function(){
  console.log("Crawl complete.");
  console.log("Running time: (s)"+(Date.now() - start)/1000);
});

// Remember when the crawl started so the running time can be reported.
crawler.on("crawlstart", function() {
  console.log("Crawl starting.");
  start = Date.now();
});

// Intentionally empty: nothing is done when a fetch starts.
crawler.on("fetchstart", function(queueItem) {

});

// For every fetched commit page: look up the developer by name (inserting a
// new row when none matches) and hand the page to handleFiles together with
// the developer id.
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {

  var $ = cheerio.load(responseBuffer);
  //Get dev's name
  var devName = $('.commit-info').children('tr:nth-child(1)')
    .children('td:nth-child(2)').text().trim();

  //Add dev to db
  // The pattern is passed as a regular, escaped query value. The previous
  // raw-text placeholder ('%$1^%') pasted the scraped name into the SQL
  // unescaped, which broke on quotes and allowed SQL injection.
  db.any("select * from developers WHERE name LIKE $1", '%' + devName + '%')
      .then(function (data) {
          if(data.length > 0) {
            // Existing developer: reuse the first matching row.
            handleFiles(responseBuffer, data[0].id);
          } else {
            db.one("INSERT INTO developers(name) VALUES($1) returning id", devName)
                .then(function (data) {
                    handleFiles(responseBuffer, data.id);
                })
                .catch(function (error) {
                    console.error("Inserting into developers failed: "+ error);
                });
          }
      })
      .catch(function (error) {
          console.error("Selecting from developers failed: "+ error);
      });
});

// Log fetches that ended in an error response.
crawler.on("fetcherror", function(queueItem, response) {
  // The original referenced the undefined identifier `respose`, so this
  // handler itself threw a ReferenceError. Log the URL and status code
  // instead of concatenating whole objects.
  console.error("An error occured while fetching " + queueItem.url + " with respose"
  + " " + response.statusCode);
});

/**
 * Adds a commit/file relation to the files_commits mapping table.
 * Failures are logged; nothing is returned.
 *
 * @param {*} commitId    Value stored in commit_id.
 * @param {*} fileId      Value stored in file_id.
 * @param {*} changeCount Value stored in change_count.
 */
function insertDevFiles(commitId, fileId, changeCount) {
  var insertSql = "INSERT INTO files_commits(commit_id, file_id, change_count) VALUES(${commitId}, ${fileId},  ${changeCount}) returning id";
  var namedValues = {
    commitId: commitId,
    fileId: fileId,
    changeCount: changeCount
  };

  db.one(insertSql, namedValues)
      .then(function () {
        // The generated id is not needed.
      })
      .catch(function (reason) {
          console.error("Inserting into files_commits failed: "+ reason);
      });
}

/**
 * Stores a commit row for the fetched commit page and then processes the
 * files changed by that commit.
 *
 * The bug URL comes from the last "Task-Url" line of the commit message
 * (second space-separated field) and the change id from the last
 * "Change-Id" line (text after the first colon); either stays undefined
 * when the message has no such line.
 *
 * @param {*} responseBuffer Page body, handed to `cheerio.load`.
 * @param {*} devId          Developer id stored with the commit.
 */
function handleFiles(responseBuffer, devId) {
  var $ = cheerio.load(responseBuffer);
  var commitMessage = $(".commit-msg").text();

  // Returns the trimmed second field of the LAST line containing `marker`,
  // splitting that line on `separator`.
  function lastTrailerValue(marker, separator) {
    var value;
    commitMessage.split('\n').forEach(function (line) {
      if (line.indexOf(marker) > -1) {
        value = line.split(separator)[1].trim();
      }
    });
    return value;
  }

  // insert into commit
  var commitRow = {
    devId: devId,
    bugUrl: lastTrailerValue("Task-Url", " "),
    changeId: lastTrailerValue("Change-Id", ":")
  };

  db.one("INSERT INTO commits(developer_id, bug_id, change_id) VALUES(${devId}, ${bugUrl}, ${changeId}) returning id", commitRow)
      .then(function (inserted) {
        handleIndividualFiles($, inserted.id);
      })
      .catch(function (reason) {
          console.error("Inserting into commits failed: "+ reason);
      });
}

/**
 * Records every changed file listed on a commit page.
 *
 * For each `.upd a` link, looks up a `files` row whose name contains the link
 * text, creates the row when none exists, and then links it to the commit
 * through insertDevFiles.
 *
 * @param {Function} $ cheerio instance loaded with the commit page
 * @param {*} commitId id of the commit row the files belong to
 */
function handleIndividualFiles($, commitId) {
  $('.upd a').each(function(i,e) {

    var name = $(e).text().trim();
    var changeCount = $(e).parent().parent().children('td:nth-child(3)').text().trim();

    // "$1#" is pg-promise's open-value form: the value is escaped but not
    // wrapped in quotes, so it can sit inside the LIKE pattern. The previous
    // "$1^" injected the scraped name as raw SQL text.
    db.any("select * from files WHERE name LIKE '%$1#%'", name)
        .then(function (rows) {
            if(rows.length > 0) {
              insertDevFiles(commitId, rows[0].id, changeCount);
            } else {
              db.one("INSERT INTO FILES(name) VALUES($1) returning id", name)
                  .then(function (row) {
                      insertDevFiles(commitId, row.id, changeCount);
                  })
                  .catch(function (error) {
                      console.error("Inserting into files failed: "+ error);
                  });
            }
        })
        .catch(function (error) {
            console.error("Selecting from files failed: "+ error);
        });
  });
}


// Start the crawl.
crawler.start();
}());
Exemplo n.º 26
0
  // Crawls the site served on localhost:8080 and fails the test when a broken
  // link, a missing in-page anchor, or a timeout on a non-external link is found.
  it('should complete without error', function(done) {
    this.timeout(500000);
    var errors = 0;
    var crawler = new Crawler('localhost', '/', 8080);
    crawler.maxConcurrency = 10;
    crawler.userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36';
    crawler.acceptCookies = false;
    crawler.timeout = 20000;
    crawler.filterByDomain = false;
    crawler.interval = 5;
    crawler.supportedMimeTypes = [/^text\//i];
    crawler.downloadUnsupported = false;

    // Link text stored when the item was queued below. Items queued without
    // meta (presumably the start URL) yield undefined instead of throwing.
    function linkContentOf(queueItem) {
      return queueItem.meta ? queueItem.meta.content : undefined;
    }

    crawler.addFetchCondition(function(parsedUrl) {
      return parsedUrl.protocol !== 'mailto';
    });
    crawler.addFetchCondition(function(parsedUrl) {
      return !(parsedUrl.host === 'localhost' && parsedUrl.port === 35729);
    });
    crawler.addFetchCondition(function(parsedUrl) {
      return (parsedUrl.host !== 'vimeo.com');
    });
    crawler.addFetchCondition(function(parsedUrl) {
      // Broken webserver that returns 404 not found for regular pages
      return (parsedUrl.host !== 'www.emaxmodel.com');
    });
    crawler.addFetchCondition(function(parsedUrl) {
      return (parsedUrl.host !== '192.168.0.1');
    });

    // External pages are checked for reachability only; their bodies are not downloaded.
    crawler.addDownloadCondition(function(queueItem) {
      var uriis = classifyUrl(queueItem);
      return !uriis.external;
    });

    // Custom link discovery: queues links itself (attaching the link text as
    // meta) and always returns [] so the crawler's own discovery adds nothing.
    crawler.discoverResources = function(buf, queueItem) {
      var urlis = classifyUrl(queueItem);
      if (urlis.external || urlis.image) {
        return [];
      }

      var $ = cheerio.load(buf.toString(), {
        normalizeWhitespace: false,
        xmlMode: false,
        decodeEntities: true
      });

      var parsedUrl = url.parse(queueItem.url);
      // is this the redirector page? follow device tree from here
      // this might make the crawl take ALOT longer
      if ($('#device-redirector').length === 1) {
        // determine if fromUrl was device specific
        var selectDevice;
        // An item without a referrer has no device context; url.parse(undefined) would throw.
        var fromPathname = queueItem.referrer ? (url.parse(queueItem.referrer).pathname || '') : '';
        var devicePath = _.intersection(fromPathname.split('/'), devices);
        if (devicePath.length > 0) {
          selectDevice = devicePath[0];
        }

        $('ul.devices').find('a').each(function(index, a) {
          // if we come from a device-specific page, only choose that device link forward
          if (selectDevice && $(a).attr('id') !== (selectDevice + '-link')) {
            return;
          }

          var toQueueUrl = $(a).attr('href');

          // include hash used to access redirector
          var absolutePath = url.resolve(queueItem.url, toQueueUrl) + (parsedUrl.hash || '');
          // preserve original fromUrl and content
          // c.queue([{
          //   uri: absolutePath,
          //   callback: crawlCallback.bind(null, fromUrl, absolutePath, content)
          // }]);
          if (!queueItem.meta) {
            console.log(queueItem);
          }
          crawler.queueURL(absolutePath, queueItem, { content: linkContentOf(queueItem) });
        });
        return [];
      }

      // make sure the hash used is valid on this page
      if (parsedUrl.hash) {
        if (isPullRequest && urlis.autogeneratedApiLink) {
          return [];
        }

        if ($(parsedUrl.hash).length === 0) {
          console.error(chalk.red(util.format('ERROR: 404 (missing hash) ON %s CONTENT %s LINKS TO %s', queueItem.referrer, linkContentOf(queueItem), queueItem.url)));
          errors++;
        }
        // only check the hash here
        // let the non-hash version crawl the rest of the tree
        return [];
      }

      $('a').each(function(index, a) {
        var toQueueUrl = $(a).attr('href');
        var linkContent = $(a).text();
        if (!toQueueUrl) return;

        // Same-page anchors ("#id") are verified against this document directly.
        if (toQueueUrl.indexOf('#') === 0 && toQueueUrl.length > 1) {
          if (isPullRequest && urlis.autogeneratedApiLink) {
            return;
          }

          if ($(toQueueUrl).length === 0) {
            console.error(chalk.red(util.format('ERROR: 404 relative link ON %s CONTENT %s LINKS TO %s', queueItem.url, linkContent, toQueueUrl)));
            errors++;
          }
        }

        if (!shouldCrawl(toQueueUrl)) {
          return;
        }
        var absolutePath = url.resolve(queueItem.url, toQueueUrl);
        // Remove hash
        absolutePath = absolutePath.replace(/#.*/, '');
        crawler.queueURL(absolutePath, queueItem, { content: linkContent });
      });

      $('img').each(function (index, img) {
        var toQueueUrl = $(img).attr('src');
        if (!toQueueUrl) return;

        toQueueUrl = url.resolve(queueItem.url, toQueueUrl);
        crawler.queueURL(toQueueUrl, queueItem, { content: 'image' });
      });

      return [];
    };

    // crawler.on('fetchstart', function(queueItem) {
    //   console.log('start', queueItem.url);
    // });

    // crawler.on('fetchheaders', function(queueItem, response) {
    //   console.log('headers', queueItem.url, complete, len);
    // });

    // crawler.on('fetchcomplete', function(queueItem) {
    //   console.log('complete', queueItem.url);
    // });

    // Timeouts on external links are warnings; all other timeouts count as errors.
    crawler.on('fetchtimeout', function (queueItem) {
      var msg = util.format('timeout ON %s CONTENT %s LINKS TO %s', queueItem.referrer, linkContentOf(queueItem), queueItem.url);
      var urlis = classifyUrl(queueItem);
      if (urlis.external) {
        console.log(chalk.yellow('WARN: ' + msg));
      } else {
        console.error(chalk.red('ERROR: ' + msg));
        errors++;
      }
    });

    // Shared handler for 'fetch404' and 'fetcherror'.
    function fetchResultError(queueItem, response) {
      // 429 and 200 are never reported as broken links.
      if (queueItem.stateData.code === 429) {
        return;
      }
      if (queueItem.stateData.code === 200) {
        return;
      }

      var urlis = classifyUrl(queueItem);
      if ((isPullRequest && urlis.githubEditLink && queueItem.stateData.code === 404) ||
          (isPullRequest && urlis.autogeneratedApiLink && queueItem.stateData.code === 404)) {
        return;
      }

      var msg = util.format('%s ON %s CONTENT %s LINKS TO %s', queueItem.stateData.code, queueItem.referrer, linkContentOf(queueItem), queueItem.url);
      if (urlis.external && Math.floor(queueItem.stateData.code / 100) === 5) {
        // allow 5XX status codes on external links
        console.log(chalk.yellow('WARN: ' + msg));
        return;
      }
      console.error(chalk.red('ERROR: ' + msg));
      errors++;
    }

    crawler.on('fetch404', fetchResultError);
    crawler.on('fetcherror', fetchResultError);
    crawler.on('complete', function() {
      if (errors > 0) {
        return done(new Error('There are ' + errors + ' broken link(s)'));
      }
      return done();
    });
    crawler.start();
  });
Exemplo n.º 27
0
  // Queue job: crawl `url`, collect e-mail addresses that belong to the crawled
  // domain, then create or update one Directory document per address before
  // signalling completion through `cb`.
  queue.push(function(cb) {
    var crawler = new Crawler(url);
    // Maps each collected address to the URL of the page it was found on.
    var object = {};

    crawler.timeout = 1000;
    crawler.stripWWWDomain = true;
    crawler.stripQuerystring = true;

    // Skip static assets and documents.
    crawler.addFetchCondition(function(parsedURL) {
      if (parsedURL.path.match(/\.(css|jpg|pdf|gif|docx|js|png|ico)/i)) {
        return false;
      }
      return true;
    });

    crawler.on("crawlstart",function() {
      console.log("Crawl starting");
    });

    crawler.on("fetchstart",function(item) {
      console.log("Fetching " + item.url);
    });

    crawler.on("fetchcomplete",function(item, buffer, response) {
      var html = buffer.toString();
      // Compute the domain BEFORE extracting: it used to be assigned only after
      // the extractEmails call, so that call always received undefined.
      var domain = crawler.host.replace(/.*?:\/\//g, "");
      domain = domain.replace('www.', '');

      var array = extractEmails(html, domain);
      if (array && array.length > 0) {
        array.forEach(function(address) {
          if (address.indexOf(domain) !== -1) {
            object[address] = item.url;
          }
        });
      }
    });

    crawler.on("complete",function() {

      console.log("Finished, now adding");

      function finish() {
        console.log('Done with queue item');
        cb();
      }

      var promises = [];
      Object.keys(object).forEach(function(address) {

        var deferred = Q.defer();
        promises.push(deferred.promise);
        Directory.findOne({email: address}, function(err, directory) {
          if (err) {
            console.log(err);
            // Resolve anyway: an unresolved deferred would keep Q.all pending
            // forever and the queue would never receive cb().
            deferred.resolve();
          }
          else if (!directory) {
            directory = new Directory({
              email: address,
              host: object[address],
              survey_ids: [doc._id]
            });
            directory.save(function(err, directory) {
              if (err)
                console.log(err);
              else
                console.log("Added " + address);
              deferred.resolve();
            });
          }
          else {
            directory.update({"$push": {"survey_ids": doc._id}}, function(err) {
              if (err) console.log(err);
              console.log("Updated " + address);
              deferred.resolve();
            });
          }
        });
      });
      if (promises.length > 0) {
        Q.all(promises).then(finish);
      }
      else {
        finish();
      }

    });

    crawler.start();

    // Marked as crawled as soon as the crawl starts, not when it completes.
    doc.update({crawled: true}, function(err) {
      if (err) console.log(err);
    });


  });
Exemplo n.º 28
0
    /**
     * Crawls the site at `i`, collecting the URLs of HTML pages into `out_urls`,
     * then recurses with the next entry of `in_urls`. Once called with a falsy
     * `i`, every collected page is run through pa11y one at a time and the
     * per-key totals in `accessibility` are written to the `results` table.
     *
     * @param {string|undefined} i URL to crawl next; falsy starts the analysis phase
     */
    function crawl(i) {
        // Adds one pa11y result's counts to a running tally.
        function addCounts(tally, count) {
            tally["total"] += count.total;
            tally["errors"] += count.error;
            tally["warnings"] += count.warning;
            tally["notices"] += count.notice;
        }

        // Writes one row per key of `accessibility` to the results table.
        function saveResults() {
            Object.keys(accessibility).forEach(function (key){
                console.log(accessibility[key].total);
                pg.connect(conString, function (err, client, done){
                    if (err){
                        console.log("ERROR: "+err);
                        return;
                    }
                    // Parameterized so repo/key values cannot break or inject SQL.
                    client.query(
                        "INSERT INTO results VALUES ($1, $2, $3, $4, $5, $6);",
                        [
                            repo,
                            key,
                            accessibility[key].total,
                            accessibility[key].errors,
                            accessibility[key].warnings,
                            accessibility[key].notices
                        ],
                        function (err){
                            if (err){
                                console.log("ERROR: "+err);
                            }
                            // Release the client only after the query has finished.
                            done();
                        }
                    );
                });
            });
        }

        // Analyzes `target` with pa11y on `port`, then continues with the next
        // collected URL; saves the totals when no URL is left.
        function access(target, port){
            if (!target){
                saveResults();
                return;
            }
            console.log("analyzing "+target);
            pa11y.sniff({
                url: target,
                standard: standard,
                timeout: timeout,
                height: height,
                width: width,
                port: port
            }, function (err, results){
                if (err || !results){
                    // A failed page must not abort the remaining pages.
                    console.log("ERROR: pa11y failed for "+target+": "+err);
                } else {
                    var k = target.match(/\/\/(www\.)?(.*?)\//);
                    var site = k ? accessibility[k[2]] : undefined;
                    if (site){
                        addCounts(site, results.count);
                    } else {
                        console.log("WARN: no accessibility entry for "+target);
                    }
                    addCounts(accessibility["overall"], results.count);
                }

                portscanner.findAPortNotInUse('13200', '13300', '127.0.0.1', function (error, port){
                    return access(out_urls.shift(), port);
                });
            });
        }

        if (i){
            console.log("crawling "+i);

            var crawlee = new Crawler(i);

            // TODO: allow setting these values in pa11y.yaml
            crawlee.interval = interval;
            crawlee.maxConcurrency = maxConcurrency;
            crawlee.maxDepth = depth;
            crawlee.downloadUnsupported = false;
            crawlee.discoverRegex = [
                /(\shref\s?=\s?|url\()([^\"\'\s>\)]+)/ig,
                /(?!.*\.(ico|css|ttf|js|xml|svg|jpg|png)\")(\shref\s?=\s?|url\()['"]([^"']+)/ig,
                /http(s)?\:\/\/[^?\s><\'\"]+/ig,
                /url\([^\)]+/ig,
                /^javascript\:[a-z0-9\$\_\.]+\(['"][^'"\s]+/ig
            ];

            crawlee.on("fetchcomplete", function (queueItem, responseBuffer, response) {
                // Responses without a content-type header are treated as non-HTML.
                var contentType = response.headers['content-type'] || '';
                if (contentType.match(/text\/html/)) {
                    out_urls.push(queueItem.url);
                    console.log("found new url: "+queueItem.url);
                } else {
                    console.log("non-html: "+queueItem.url);
                }
            });

            crawlee.on("complete", function (){
                console.log("finished crawling "+i);
                return crawl(in_urls.shift());
            });

            crawlee.start();
            return;
        }

        console.log("sending "+out_urls.length+" pages to pa11y");
        portscanner.findAPortNotInUse('13200', '13300', '127.0.0.1', function (error, port){
            access(out_urls.shift(), port);
        });
    }
Exemplo n.º 29
0
// Starts the server, crawls it, and reports duplicate id attributes and 404s.
// The process exits with 0 when no errors were found and 1 otherwise.
runServer().then(server => {
  const crawler = new Crawler(`${server.url}/`);
  const origDiscoverResources = crawler.discoverResources;
  // Maps a discovered URL to the paths of the pages that link to it.
  const referrers = {};
  const notFound = [];
  const duplicateIds = [];

  // Wrap resource discovery to also scan every HTML page for duplicate ids.
  crawler.discoverResources = function(buffer, item) {
    if (/^text\/html/.test(item.stateData.contentType)) {
      const $ = cheerio.load(buffer.toString("utf8"));
      const ids = findDuplicateIds($);

      if (ids.length) {
        duplicateIds.push({ item, ids });
      }
    }
    return origDiscoverResources.apply(this, arguments);
  };

  crawler.addFetchCondition((item, referrerItem, cb) => {
    cb(null, shouldFetch(item, referrerItem));
  });

  crawler.maxDepth = 99;
  crawler.interval = 1;
  crawler.on("discoverycomplete", (item, resources) => {
    resources.forEach(url => {
      if (!(url in referrers)) {
        referrers[url] = [];
      }
      referrers[url].push(item.path);
    });
  });
  crawler.on("fetch404", (item, res) => {
    notFound.push(item);
  });
  crawler.on("complete", () => {
    server.httpServer.close(() => {
      let errors = 0;
      let warnings = 0;
      // A problem is only a warning when EVERY page involved is a known
      // warning page. An empty list is an error: [].every() is always true,
      // which would otherwise downgrade it silently.
      const makeLabelForPaths = paths => {
        const isWarning = paths.length > 0 &&
          paths.every(path => WARNING_PAGES.includes(path));
        const label = isWarning ? WARNING : ERROR;

        if (isWarning) {
          warnings++;
        } else {
          errors++;
        }

        return label;
      };

      duplicateIds.forEach(({ item, ids }) => {
        const label = makeLabelForPaths([ item.path ]);
        console.log(`${label}: duplicate id attrs found at ${item.path}:`);
        console.log(`  ${ids.join(', ')}`);
      });

      notFound.forEach(item => {
        // A 404 item may have no recorded referrer; fall back to an empty
        // list rather than crashing on undefined.
        const refs = referrers[item.url] || [];
        const label = makeLabelForPaths(refs);

        console.log(`${label}: 404 for ${item.path}`);
        console.log(`  ${refs.length} referrer(s) including at least:`,
                    refs.slice(0, 5));
      });

      WARNING_PAGES.forEach(path => {
        if (!(`${server.url}${path}` in referrers)) {
          console.log(`${ERROR}: ${path} was not visited!`);
          console.log(`  If this is not an error, please remove the path ` +
                      `from WARNING_PAGES.`);
          errors++;
        }
      });

      const success = errors === 0;

      console.log(`${errors} error(s) and ${warnings} warning(s) found.`);
      if (success) {
        console.log(chalk.green(`Hooray!`));
      } else {
        console.log(chalk.red(`Alas.`));
      }
      process.exit(success ? 0 : 1);
    });
  });
  crawler.start();
}).catch(err => {
  // A server that fails to start (or a setup error above) must fail the run
  // instead of leaving an unhandled rejection.
  console.error(err);
  process.exit(1);
});
Exemplo n.º 30
0
    if (!new RegExp(blacklist.join("|")).test(link)) { //this prevents the crawler from crawling irrelevant/useless pages
      return link;
    }
  }).get()
};

// Register every listener BEFORE starting the crawler, so that events emitted
// during start() (such as "crawlstart") cannot be missed.
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
    var fetchedAddress = queueItem.url;
    //var $ = cheerio.load(responseBuffer.toString("utf8"));
    //$('.div-col.columns.column-count.column-count-2').nextAll().remove(); //removes reference section from article for ease of reading
    //var cleanedData = $('#mw-content-text').text() //assigning body of article to variable for testing/logging
    //console.log(fetchedAddress); //logs the link of article that it is currently crawling
    //console.log(cleanedData);
});

crawler.on("crawlstart", function() {
  console.log("begin!");
});

crawler.on("complete", function() { //this event does not fire and the console hangs
  console.log("end!");
});

crawler.start(); //start crawler



var originalEmit = crawler.emit;
crawler.emit = function(evtName, queueItem) {
   crawler.queue.complete(function(err, completeCount) {
       if (err) {