This repository has been archived by the owner on Feb 7, 2023. It is now read-only.
/
site-scanner.js
114 lines (97 loc) · 2.92 KB
/
site-scanner.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// Reads a file containing a list of domains and invokes phantomjs with site.js for each domain. Runs up to 20 phantomjs processes in parallel.
var fs = require('fs'),
lazy = require('lazy'),
repo = require('retire/lib/repo'),
Emitter = require('events').EventEmitter,
path = require('path'),
spawn = require('child_process').spawn;
// ---- Command line -----------------------------------------------------------
// Validate the arguments before touching the file system, so a usage error
// neither creates working directories nor reports success to the caller.
if (process.argv.length !== 3) {
  console.error('Usage: node site-scanner.js <file-with-domain-list>');
  process.exit(1);
}
// Path of the file that lists the domains to scan, one per line.
var dfile = process.argv[2];

// ---- Shared scanner state ---------------------------------------------------
// Upper bound on phantomjs processes running at the same time.
var threads_max = 20;
// Number of phantomjs processes currently running.
var threads = 0;
// Internal event bus; the rest of the script uses the events
// 'begin', 'domain' and 'phantom-ready' on it.
var events = new Emitter();

// ---- Working directories ----------------------------------------------------
// A '<domain>.log' file in donedir marks a domain as already scanned.
var donedir = 'phantom-done';
var tmpdir = 'tmp';
if (!fs.existsSync(donedir)) fs.mkdirSync(donedir);
if (!fs.existsSync(tmpdir)) fs.mkdirSync(tmpdir);
// Cached JSON list of extractor functions, passed to site.js on every scan.
var funcsfile = tmpdir + '/jsfuncs.js';

// Start time of the whole run (ms since epoch), used for progress output.
var timeBegin = new Date().getTime();
/**
 * Scans a single domain by spawning
 * `phantomjs --load-images=false --ssl-protocol=tlsv1 site.js <funcsfile> <url>`.
 *
 * When the child finishes (normally, with an error, or because it was killed
 * after the two-minute timeout) the outcome is logged, the global `threads`
 * counter is decremented and 'phantom-ready' is emitted on `events` so the
 * next queued domain can start. The slot is released exactly once per call.
 *
 * @param {string} url - Domain/URL passed to site.js as its last argument.
 */
function scan(url) {
  console.log('Scanning\t' + url + ' ...');
  var begin = new Date().getTime();
  var timedOut = false;
  var finished = false;
  var timeout;
  var child = spawn('phantomjs', ['--load-images=false', '--ssl-protocol=tlsv1', 'site.js', funcsfile, url]);

  // Nothing here reads the child's output. Drain the pipes so a child that
  // prints a lot cannot stall on a full pipe buffer and end up timing out.
  if (child.stdout) child.stdout.resume();
  if (child.stderr) child.stderr.resume();

  // Time spent on this scan so far, formatted for the log lines.
  function elapsed() {
    return (new Date().getTime() - begin) + 'ms';
  }

  // Frees the worker slot. Guarded by `finished` in the listeners below because
  // a failed spawn emits 'error' and may be followed by 'close'.
  function release() {
    finished = true;
    threads--;
    clearTimeout(timeout);
    events.emit('phantom-ready');
  }

  // Without this listener a failed spawn (e.g. phantomjs not on PATH) would
  // surface as an unhandled 'error' event and crash the whole scanner.
  child.on('error', function(err) {
    if (finished) return;
    console.warn('Error\t' + (err.code || err.message) + ' ' + url, elapsed());
    release();
  });

  child.on('close', function(code) {
    if (finished) return;
    var end = elapsed();
    if (code !== 0) {
      // `code` is null when the child was terminated by a signal (our kill()).
      if (timedOut) {
        console.warn('Timeout\t' + code + ' ' + url, end );
      } else {
        console.warn('Error\t' + code + ' ' + url, end );
      }
    } else {
      console.log('Done\t\t' + url, end);
    }
    release();
  });

  // Give every scan at most two minutes before killing the child.
  timeout = setTimeout(function() {
    timedOut = true;
    child.kill();
  }, 2*60*1000);
}
// Queue of every domain read from the input file, in file order.
var domains = [];

// 'domain' fires once per input line: queue the entry, report progress every
// 1000 entries, and poke the dispatcher in case a worker slot is free.
events.on('domain', function onDomain(entry) {
  var loaded = domains.push(entry); // push() returns the new queue length
  var atMilestone = (loaded % 1000 === 0);
  if (atMilestone) {
    console.log('Domains loaded: ' + loaded);
  }
  events.emit('phantom-ready');
});
// Index of the next queued domain to look at.
var dix = 0;
// Number of scans actually started during this run (skipped domains excluded).
var num = 0;

// Dispatcher: each 'phantom-ready' event starts at most one scan, provided a
// domain is queued and a worker slot is free. Domains that already have a
// '<domain>.log' file in donedir are skipped.
//
// Skipping is done with a loop rather than by re-emitting 'phantom-ready' from
// inside this listener: emit() runs listeners synchronously, so a long run of
// already-scanned domains would otherwise nest one call per skipped domain and
// could exhaust the call stack.
events.on('phantom-ready', function() {
  while (dix < domains.length && threads < threads_max) {
    var domain = domains[dix++].toString();
    // Slashes are replaced so the domain maps to a single file name.
    var donefile = path.join(donedir, domain.replace(/\//g, "_") + '.log');
    if (fs.existsSync(donefile)) {
      continue;
    }
    threads++;
    num++;
    // Progress: running workers, scans started, position in queue, percent, elapsed seconds.
    console.log(threads, num, dix + '/' + domains.length, '(' + Math.round(dix/domains.length*100) + '%)' , Math.round((new Date().getTime() - timeBegin)/1000) + 's');
    scan(domain);
    return;
  }
});
// 'begin' starts the run: stream the domain list line by line and publish
// every truthy line as a 'domain' event.
events.on('begin', function onBegin() {
  console.log('Reading file ' + dfile + ' ...');
  var input = fs.createReadStream(dfile);
  var reader = new lazy(input);
  reader.lines.forEach(function publish(line) {
    if (!line) {
      return;
    }
    events.emit('domain', line);
  });
});
// Bootstrap: make sure the extractor-function cache exists, then emit 'begin'.
// If the cache file is already there it is reused as is; otherwise the
// retire.js repository is downloaded and every `extractors.func` entry is
// written to the cache as a {component, func} pair.
if (fs.existsSync(funcsfile)) {
  events.emit('begin');
} else {
  var loader = repo.loadrepository('https://raw.github.com/bekk/retire.js/master/repository/jsrepository.json', {nocache:true});
  loader.on('done', function onRepoLoaded(jsrepo) {
    var funcs = [];
    for (var component in jsrepo) {
      var extractorFuncs = jsrepo[component].extractors.func;
      for (var idx in extractorFuncs) {
        funcs.push({"component" : component, func : extractorFuncs[idx]});
      }
    }
    var payload = JSON.stringify(funcs);
    fs.writeFile(funcsfile, payload, function onWritten(err) {
      if (!err) {
        events.emit('begin');
        return;
      }
      // Without the cache file no scan can run, so give up.
      console.log(err);
      process.exit(1);
    });
  });
}