/**
 * Scrapes the detail page of the next queued station, converts its Lambert II
 * coordinates with proj4 and appends the resulting record to `stations`.
 *
 * Entries are popped from the end of `stations_url` one at a time and the
 * function calls itself again after each page. Once the queue is empty it
 * hands over to `makegeoJSON()`. A page that cannot be fetched, or whose text
 * does not match the expected layout, is logged and skipped so that a single
 * bad station does not abort the whole run.
 */
function scrapStationDetail(){
  console.log('station detail');
  if(!stations_url.length){
    makegeoJSON();
    return;
  }

  var station_url = stations_url.pop();
  var detailUrl = 'http://www.vigicrues.gouv.fr/'+station_url.url;

  scrap(detailUrl+'&ong=3', function(err,$){
    if(err){
      console.error('station detail request failed: '+detailUrl, err);
      scrapStationDetail();
      return;
    }

    var txt = $('.contenu_cadre div').text();
    // Capture groups: 1 = station name, 2 = two-digit department,
    // 3 = river name, 4 = Lambert II X (metres), 5 = Lambert II Y (metres).
    var reg = /\r\n\t Station : \r\n\t (.*)\r\n\t Département : \r\n\t ([0-9]{2})\r\n\t \r\n\r\n\t Cours d\'eau : \r\n\t (.*)\r\n\t \r\n\r\n\t Coordonnées Lambert II : \r\n\t X=([0-9]+) m, Y=([0-9]+)/m;
    var matches = txt.match(reg);
    if(!matches){
      // The page layout differs from what the pattern expects; skip it
      // instead of throwing on matches[4].
      console.error('station detail not recognised: '+detailUrl);
      scrapStationDetail();
      return;
    }

    var lambert2 ='+proj=lcc +lat_1=46.8 +lat_0=46.8 +lon_0=0 +k_0=0.99987742 +x_0=600000 +y_0=2200000 +a=6378249.2 +b=6356515 +towgs84=-168,-60,320,0,0,0,0 +pm=paris +units=m +no_defs ';
    var x = parseInt(matches[4],10);
    var y = parseInt(matches[5],10);
    console.log([x,y]);

    // The inverse projection result is read as [longitude, latitude].
    var latlng = proj4(lambert2).inverse([x,y]);
    var lng = latlng[0];
    var lat = latlng[1];

    var station ={
      bassin:station_url.bassin,
      url:detailUrl,
      station:matches[1],
      department:matches[2],
      river:matches[3],
      X:matches[4],
      Y:matches[5],
      lat:lat,
      lng:lng
    };
    console.log(station);
    stations.push(station);
    scrapStationDetail();
  });
}
/**
 * Fetches a single message and renders the YouTube links found in its body.
 *
 * @param {{url: string, date: string}} msg - Message descriptor; `url` is
 *   requested with POST and `date` is passed through to `renderTitle`.
 * @param {function(?Error, string=)} callback - Receives the request error, or
 *   `null` plus the rendered lines joined by newlines. The string is empty
 *   when the message contains no YouTube link.
 */
function getMessage(msg, callback) {
  var url = msg.url;
  var date = msg.date;
  var output = [];

  scrap({url: url, method: 'POST', jar: cookieJar, encoding: 'iso-8859-1'}, function(err, $) {
    if (err) {
      return callback(err);
    }

    // The subject is the text of the node that follows the "Asunto:" legend.
    // NOTE(review): relies on `this` being a wrapped element inside `.each`
    // (it is called as `this.text()`), as in the original code - confirm
    // against the scraper version in use.
    var asunto = "";
    $('.legendFicha').each(function() {
      if (this.text().match(/Asunto:/)) {
        asunto = this.next(false).text().trim();
      }
    });

    // `.html()` yields null when the body container is missing.
    var body = $('#cuerpoMensaje').html() || '';
    var matches = body.match(/(https?:\/\/www\.youtube\.com\/watch\?v=[^\s|\^"<]+)/g);

    if (matches) {
      // Keep the first occurrence of every link, preserving order.
      var seen = Object.create(null);
      var unique = matches.filter(function(link) {
        if (seen[link]) {
          return false;
        }
        seen[link] = true;
        return true;
      });

      output.push(renderTitle(asunto, date));
      output.push("<div>");
      unique.forEach(function(link) {
        output.push(renderLink(link));
      });
      output.push("</div>");
    }

    callback(null, output.join('\n'));
  });
}
// Load the ESLint configuration at `configPath`, scrape the ESLint rules page
// and fill `templateContext.categories` with one entry per <h2> category, each
// listing its rules together with the value configured for them. The outcome
// is reported through `loaded_callback(err, templateContext)`.
fs.readFile(configPath, 'utf8', function (err, contents) {
  if (err) {
    return loaded_callback(err);
  }

  var config;
  try {
    // The encoding belongs to readFile above; JSON.parse's second argument is
    // a reviver function, so a string passed there is ignored.
    config = JSON.parse(contents);
  } catch (loadErr) {
    return loaded_callback(loadErr);
  }

  if (!config || !config.rules) {
    return loaded_callback(new Error("Invalid configuration file"));
  }

  // Grab the eslint rules page
  scrap(eslintRulesPage, function (scrapErr, $) {
    if (scrapErr) {
      return loaded_callback(scrapErr);
    }

    // Scrap rule categories: each <h2> is a category whose following <p> is
    // its description, and the element after that holds the rule list.
    async.each($('h2'), function (item, headerProcessed_callback) {
      var header = $(item);
      var summary = header.next('p');
      var category = {
        name: header.text(),
        description: summary.text(),
        rules: []
      };
      templateContext.categories.push(category);

      // Scrap rules for this category
      async.eachSeries(summary.next().find('li'), function (rule, ruleProcessed_callback) {
        // Every list item reads "<rule-name> - <description>".
        var ruleParts = $(rule).text().trim().split(' - ');
        var ruleName = ruleParts[0];
        var configRule = config.rules[ruleName];

        category.rules.push({
          // Array form is [severity, ...options]; only unwrap real arrays so a
          // string severity such as "error" is not reduced to its first letter.
          used: Array.isArray(configRule) && configRule.length ? configRule[0] : configRule,
          value: configRule,
          name: ruleName,
          link: eslintRulesPage + ruleName + ".html",
          // Re-join so a description that itself contains " - " is kept whole.
          description: ruleParts.length > 1 ? ruleParts.slice(1).join(' - ') : undefined
        });
        ruleProcessed_callback(null);
      }, headerProcessed_callback);
    }, function (eachErr) {
      if (eachErr) {
        return loaded_callback(eachErr);
      }
      loaded_callback(null, templateContext);
    });
  });
});
/**
 * Scrapes the basin index page and queues every basin link in `bassins` as
 * `{url, text}`, then starts the station crawl with `scrapStations()`.
 *
 * If the index page cannot be fetched the error is logged and the crawl is
 * not started, since there is nothing to iterate over.
 */
function scrapBassins(){
  scrap('http://www.vigicrues.gouv.fr/niv2.php', function(err, $) {
    if (err) {
      console.error('basin list request failed', err);
      return;
    }

    $('.contenu_cadre a').each(function(pos, item){
      // The label is the data of the anchor's first child node; an anchor
      // without children gets an undefined label instead of throwing.
      var firstChild = item.children && item.children[0];
      bassins.push({
        url: item.attribs.href,
        text: firstChild ? firstChild.data : undefined
      });
    });

    scrapStations();
  });
}
// Build one `minder` record per listing on the results page, push it onto
// `minders`, and fetch the listing's own page to fill in the remaining fields.
// The completed record is printed as JSON once the detail page has been read.
$( '[id$=_member]>div:first-child' ).each( function() {
  i++;

  var entry = $( this );
  var listing = entry.parent();
  var href = entry.find( 'a' ).attr( 'href' );
  var matches;

  var minder = {};
  minder.name = entry.find( 'h2' ).first().text();
  minder.membership = entry.find( 'h4' ).text();
  minder.url = 'http://www.ichild.co.uk' + href;

  // The postcode area is taken from the first letters-then-digits run in the
  // profile path, e.g.
  // /directory/childcare/registered_childminders/manchester/M21/clairelmackie/
  minder.postcode = 'Unknown';
  if ( href ) {
    matches = href.match( /([a-z]+[0-9]+)/i );
    if ( matches ) minder.postcode = matches[1];
  }

  minder.distance = 'Unknown';
  var milesHeading = listing.find( 'div.grid_3 h4:contains("miles")' );
  if ( milesHeading.length ) {
    matches = milesHeading.text().match( /(\d+\.\d+)/i );
    if ( matches ) minder.distance = matches[1];
  }

  minder.vacancy = listing.find( 'div.grid_3 .vacancy' ).length ? "Yes" : "No";

  // Inside this callback `$` refers to the detail page, not the results page.
  scrap( minder.url, function( err, $ ) {
    if ( err ) {
      console.log( err.code + " - " + minder.url );
      return;
    }

    // Record 'Yes'/'No' per list item, keyed by the item's heading text. A
    // selection object is always truthy, so the tick has to be detected
    // through `.length`.
    var recordTicks = function( items, labelTag ) {
      items.each( function() {
        var item = $( this );
        var label = item.find( labelTag ).text();
        minder[ label ] = item.find( '.checkbox_ticked' ).length ? 'Yes' : 'No';
      } );
    };

    recordTicks( $( '#view_listing h3:contains("Childcare Offered")' ).siblings( 'ul' ).find( 'li' ), 'h4' );
    recordTicks( $( '#view_listing .details ul li' ), 'h6' );

    // Collapse every whitespace run (including line breaks) to one space.
    minder.fees_statement = $( '.home_section_title:contains("£")' ).text().replace( /\s+/g, ' ' ).trim();

    // The first span styled with colour #FF809F is read as the baby-vacancy
    // figure; it stays 0 when the span is absent or holds no digits.
    minder.current_baby_vacancies = 0;
    var babySpan = $( 'span[style*="#FF809F"]' ).first();
    if ( babySpan.length ) {
      var digits = babySpan.text().match( /(\d+)/i );
      if ( digits ) minder.current_baby_vacancies = digits[1];
    }

    console.log( JSON.stringify( minder ) + ',' );
  } );

  minders.push( minder );
} );
/**
 * Scrapes the page of the next queued basin and queues every station link
 * found on it in `stations_url` as `{url, bassin}`.
 *
 * Basins are popped from the end of `bassins` one at a time and the function
 * calls itself again after each page. When the queue is empty it hands over
 * to `scrapStationDetail()`. A basin page that cannot be fetched is logged
 * and skipped so the remaining basins are still processed.
 */
function scrapStations(){
  console.log('station ');
  if(!bassins.length){
    scrapStationDetail();
    return;
  }

  var bassin = bassins.pop();
  var bassinUrl = 'http://www.vigicrues.gouv.fr/'+bassin.url;

  scrap(bassinUrl, function(err,$){
    if(err){
      console.error('basin request failed: '+bassinUrl, err);
      scrapStations();
      return;
    }

    // Station links are the circular <area> elements on the basin page.
    $('area[shape=circle]').each(function(pos,area){
      stations_url.push({
        url:area.attribs.href,
        bassin:bassin.text
      });
    });

    scrapStations();
  });
}
/**
 * Requests one page of the message list and extracts a link per message.
 *
 * @param {number} pagina - Page number placed in the `<pagina>` element of the
 *   request payload.
 * @param {function(?Error, Array<{url: string, date: string}>=)} callback -
 *   Receives the request error, or `null` plus the extracted links.
 */
function getPageMessages(pagina, callback) {
  // Note: this sets the method name on the shared `headers` object.
  headers['X-AjaxPro-Method'] = 'listaMensajes';
  var payload = '{"sControls":"<root><carpeta>1</carpeta><filtro></filtro><noLeidos>false</noLeidos><pagina>' + pagina + '</pagina><orden>fecha</orden><sentido>DESC</sentido></root>"}';

  scrap({url: messagesUrl, preParse: preParseValue, method: 'POST', jar: cookieJar, headers: headers, body: payload }, function(err, $) {
    if (err) {
      return callback(err);
    }

    var links = [];
    $("tr td[onclick]").each(function(index) {
      // Only every third clickable cell is used: its text is taken as the
      // message date and its onclick handler carries the message URL.
      if (index % 3 !== 2) {
        return;
      }

      var cell = $(this);
      var onclick = (cell.attr('onclick') || '').trim();
      var target = onclick.match(/location\.href='(.+)'/);
      if (!target) {
        // A handler that does not navigate gives no URL; skip the cell
        // rather than throwing on a null match.
        return;
      }

      links.push({
        url: oneMessageUrl + target[1],
        date: cell.text()
      });
    });

    callback(null, links);
  });
}
/**
 * Crawls Wikipedia articles outward from a search term, emitting a
 * 'new node' event per visited article and a 'new edge' event per followed
 * link.
 *
 * Two call shapes are supported:
 *   iterator(term, done)                    - start from a search for `term`
 *   iterator(parentTitle, href, depth, done) - follow `href` from a parent
 *
 * At most `options.relation` links are followed per article and articles
 * deeper than `options.depth` are not visited. `done` is always called
 * without arguments; request and parsing failures are logged, not propagated.
 */
var iterator = function(term, href, depth, done) {
  var url;
  var parentNode;
  var newEdge = false;
  var host = 'http://' + options.language + '.wikipedia.org';

  if (arguments.length === 2) {
    depth = 0;
    done = href;
    url = host + '/w/index.php?search=' + encodeURIComponent(term);
  } else {
    newEdge = true;
    parentNode = term;
    // Plain concatenation: String.replace would interpret "$" sequences that
    // may occur in an href.
    url = host + href;
  }

  // Nothing is emitted beyond the depth limit, so skip the download entirely.
  if (depth > options.depth) {
    return done();
  }

  scrap(url, function(err, $) {
    if (err) {
      logger.error('Error on requesting', {internal: err});
      return done();
    }

    var newNode;
    try {
      newNode = $('#firstHeading span').text();
    } catch(scrapErr) {
      logger.error('Error on scrapping', {internal: scrapErr});
      return done();
    }

    emitter.emit('new node', newNode);
    if (newEdge) {
      emitter.emit('new edge', {
        from: parentNode,
        to: newNode
      });
    }

    var limit = parseInt(options.relation, 10);
    var related = [];
    $('#mw-content-text p a').each(function(i, link) {
      if (related.length === limit) return false;

      var anchor = $(link);
      // Links whose text starts with "[" (e.g. "[1]") are not followed.
      if (anchor.text().charAt(0) === '[') return;

      // Only site-relative paths can be appended to the host; anchors with a
      // missing, absolute or protocol-relative href would yield a bad URL.
      var target = anchor.attr('href');
      if (typeof target !== 'string' || target.charAt(0) !== '/' || target.charAt(1) === '/') return;

      related.push(target);
    });

    async.each(
      related,
      function(relatedNodeHref, callback) {
        iterator(newNode, relatedNodeHref, depth + 1, callback);
      },
      done
    );
  });
};
// Log in, list the first two pages of messages, keep those dated on or after
// 1 September 2015 and print the rendered output of each one between the page
// header and footer. A failure at any step is logged instead of crashing on
// an undefined result.
scrap({url:loginUrl, jar:cookieJar}, function(err,$) {
  if (err) {
    console.error('Could not load the login page', err);
    return;
  }

  // Send back every hidden field of the login page together with the
  // credentials and the event target of the login button.
  var form = {};
  $('[type=hidden]').each(function(i,el) {
    var field = $(el);
    form[field.attr('name')] = field.attr('value');
  });
  form.txtUserName = secret.name;
  form.txtUserPass = secret.pwd;
  form.__EVENTTARGET = 'cmdLogin';
  form.__EVENTARGUMENT = undefined;

  request.post({url:loginUrl, jar:cookieJar, form: form, headers:headers, followAllRedirects:true}, function(postErr,response,body) {
    if (postErr) {
      console.error('Login request failed', postErr);
      return;
    }

    renderPageHeader();

    var cutoff = new Date(2015,8,1); // 01/Sep/2015 (months are zero-based)

    // Parses "dd/mm/yyyy hh:mm"; returns null when the text has another shape.
    var parseMessageDate = function(text) {
      var components = String(text).split(' ');
      if (components.length < 2) {
        return null;
      }
      var c1 = components[0].split('/');
      var c2 = components[1].split(':');
      var parsed = new Date(c1[2], c1[1]-1, c1[0], c2[0], c2[1]);
      return isNaN(parsed.getTime()) ? null : parsed;
    };

    // The header is already out at this point, so the footer is still
    // rendered after a failure (presumably it closes the page - confirm).
    var abort = function(label, cause) {
      console.error(label, cause);
      renderPageFooter();
    };

    var paginas = [1,2];
    async.map(paginas, getPageMessages, function(listErr,messages) {
      if (listErr) {
        return abort('Could not list messages', listErr);
      }

      // flatten array and filter
      messages = [].concat.apply([],messages).filter(function(m) {
        var date = parseMessageDate(m.date);
        return date !== null && date >= cutoff;
      });

      async.map(messages, getMessage, function(renderErr,outputs) {
        if (renderErr) {
          return abort('Could not read a message', renderErr);
        }

        outputs.forEach(function(output) {
          if(output.length) console.log(output);
        });
        renderPageFooter();
      });
    });
  });
});
/**
 * Bot command `!hn top | !hn random`: scrapes the Hacker News front page and
 * reports either the story with the most points or a random one.
 *
 * @param {Object} bot - Must expose `message(text)`, used for all replies.
 * @param {*} data - Unused.
 * @param {*} nick - Unused.
 * @param {Array<string>} args - Exactly one argument: 'top' or 'random'.
 * @param {function()} end - Called once after a story has been reported.
 * @returns {boolean|undefined} `false` when the command is invalid; otherwise
 *   nothing (the reply is sent from the scrape callback).
 */
var hn = function(bot, data, nick, args, end) {
  var param = args[0];

  if (args.length > 1 || (param !== 'top' && param !== 'random')) {
    bot.message('Comando inválido. Exemplos: !hn top | !hn random');
    return false;
  }

  var hackerNewsURL = 'https://news.ycombinator.com/';

  scrap(hackerNewsURL, function(err, $) {
    if (err) {
      bot.message('Erro ao requisitar Hacker News');
      return false;
    }

    // A story spans consecutive rows: one with the title link, then one with
    // the subtext. Stories are kept in page order in a list, so stories with
    // equal points no longer overwrite each other.
    var news = [];
    var currentItem = {};

    $('tr').each(function() {
      var row = $(this);

      if (currentItem.done) {
        news.push(currentItem);
        currentItem = {};
      }

      // scrape the story title and URL
      var titleLink = row.find('td.title a');
      if (titleLink.length === 1) {
        currentItem.title = titleLink.text();
        currentItem.url = titleLink.attr('href');
      }

      var subtextCell = row.find('td.subtext');
      if (subtextCell.length === 1) {
        // scrape the story's discussion thread
        var threadURI = row.find('td.subtext a').next().attr('href');
        currentItem.thread = hackerNewsURL + threadURI;

        // scrape the number of points and comments: the first word is the
        // point count and the second-to-last word is the comment count
        var subtext = subtextCell.text().split(' ');
        currentItem.points = parseInt(subtext[0], 10);

        var commentCount = parseInt(subtext[subtext.length - 2], 10);
        currentItem.comments = isNaN(commentCount) ? 0 : commentCount;

        // the story is fully parsed
        currentItem.done = true;
      }
    });

    // The last story has no following row to flush it.
    if (currentItem.done) {
      news.push(currentItem);
    }

    if (!news.length) {
      bot.message('Erro ao requisitar Hacker News');
      return false;
    }

    var printItem = function(item) {
      bot.message(item.title + ' (' + item.url + ')');
      bot.message('HN: ' + item.thread);
      bot.message('Pontos: ' + item.points + ' | Comentários: ' + item.comments);
      end();
    };

    var chosen = null;
    if (param === 'top') {
      // Highest numeric score wins (the later story on a tie). Rows without
      // a numeric score are only used when no story has one.
      news.forEach(function(item) {
        if (isNaN(item.points)) {
          return;
        }
        if (chosen === null || item.points >= chosen.points) {
          chosen = item;
        }
      });
      if (chosen === null) {
        chosen = news[news.length - 1];
      }
    } else {
      chosen = news[Math.floor(Math.random() * news.length)];
    }

    printItem(chosen);
  });
};