/**
 * Final step of the job: closes the worklog, writes every collected record
 * as one JSON document per line into the configured output file and prints
 * how many records were ignored.
 *
 * @param {Object} results - result set handed over by the runtime; this
 *     callback uses `results.sorted.each(fn(key, data))` and
 *     `results.get(key)`.
 */
finish(function (results) {
    /**
     * Builds the stamp inserted into the output file name from the current
     * local date (year, month 1-12, day of month), each formatted via '%02d'.
     *
     * @returns {string} the formatted date stamp
     */
    function dateStamp() {
        var now = new Date();
        return orzo.sprintf('%02d%02d%02d', now.getFullYear(), now.getMonth() + 1, now.getDate());
    }

    /**
     * Picks the exported fields of a collected item; `time` is converted to
     * a number.
     *
     * @param {Object} dataItem - item with `id`, `time`, `account`, `text`
     * @returns {Object} plain record ready for serialization
     */
    function toRecord(dataItem) {
        return {
            id: dataItem.id,
            // explicit radix: the value must always be read as decimal
            time: parseInt(dataItem.time, 10),
            account: dataItem.account,
            text: dataItem.text
        };
    }

    var ignored;
    var ignoredCount;

    worklog.close();

    doWith(
        orzo.fileWriter(orzo.sprintf(conf.tweets.crawlerOutFile, dateStamp())),
        function (fw) {
            results.sorted.each(function (key, data) {
                // '__ignored__' carries only the skip counter, not records
                if (key === '__ignored__') {
                    return;
                }
                data.forEach(function (dataItem) {
                    fw.writeln(JSON.stringify(toRecord(dataItem)));
                });
            });
        },
        function (err) {
            orzo.printf('error: %s\n', err);
        }
    );

    // Nothing may have been emitted under '__ignored__' at all; report 0 in
    // that case instead of failing on a missing entry.
    ignored = results.get('__ignored__');
    ignoredCount = ignored ? ignored[0] : undefined;
    if (ignoredCount === undefined || ignoredCount === null) {
        ignoredCount = 0;
    }
    orzo.printf('ignored %s records\n', ignoredCount);
});
// Visits every 'div.tweet' element of `page`, collects id, timestamp and text
// of the tweet and emits the record under its timestamp. Records whose
// timestamp is lower than worklog.getLatestTimestamp() (or is missing) are
// not emitted; a 1 is emitted under '__ignored__' for each of them instead.
orzo.html.query(page, 'div.tweet', function (tweetBlock) {
    // '-' is the placeholder kept when no timestamp element is found
    var out = {id: null, time: '-', account: url, text: null};
    var tweetTime;

    // NOTE(review): applyOnFirst presumably runs the callback on the first
    // element matching the selector inside tweetBlock - confirm with its
    // definition.
    applyOnFirst(tweetBlock, 'span.js-short-timestamp', function (item) {
        out.time = item.dataset().time;
    });
    applyOnFirst(tweetBlock, 'p.tweet-text', function (item) {
        out.text = item.text();
    });
    out.id = tweetBlock.dataset()['item-id'];

    // Explicit radix so the value is always read as decimal. The '-'
    // placeholder parses to NaN, which fails the comparison below and thus
    // lands in the ignored branch.
    tweetTime = parseInt(out.time, 10);
    if (worklog.getLatestTimestamp() <= tweetTime) {
        emit(out.time, out);
    } else {
        emit('__ignored__', 1);
    }
});