/**
 * Extracts the table rows of one PDF and writes them as JSON.
 *
 * The PDF is read from `<basename of item>.pdf` and the result is written to
 * `<basename of item>.json`, both relative to the current working directory.
 * When the `debug` flag is set, intermediate data is additionally dumped into
 * the `debugcache` location.
 *
 * @param {string} item - Path of the PDF; only its base name is used to locate
 *   the file, the full value is stored in each record as `_source`.
 * @param {function(Error=)} cb - Called once when done. Receives the error if
 *   the extraction or the page iteration failed, otherwise a falsy value.
 */
var scrapePDF = function (item, cb) {
	var filename = path.basename(item).replace('.pdf', '');
	console.log('scraping pdf', filename);
	var rows_collect = [];
	var lines_collect = [];
	var pdfExtract = new PDFExtract();

	pdfExtract.extract(filename + '.pdf', {}, function (err, data) {
		if (err) {
			// Report the failure to the caller instead of only logging it;
			// otherwise the caller would wait for `cb` forever.
			console.log(err);
			return cb(err);
		}
		if (debug) {
			fs.writeFileSync(debugcache + filename + '.pdf.json', JSON.stringify(data, null, '\t'));
		}

		async.forEachSeries(data.pages, function (page, next) {
			var pageFile = debugcache + filename + '-' + page.pageInfo.num + '.json';

			var lines = PDFExtract.utils.pageToLines(page, 0.3);
			if (debug) {
				// Raw lines of the page; overwritten below with the filtered
				// lines when the filter yields any.
				fs.writeFileSync(pageFile, JSON.stringify(lines, null, '\t'));
			}

			lines = PDFExtract.utils.extractLines(lines, ['Restzahlung'], ['erstellt mit EurekaRLP']);
			if (lines.length == 0) {
				console.log('ALARM, page', page.pageInfo.num, 'without data');
			} else if (debug) {
				lines_collect = lines_collect.concat(lines);
				fs.writeFileSync(pageFile, JSON.stringify(lines, null, '\t'));
			}

			// console.log(PDFExtract.utils.xStats(page));
			/*
			 Column layout as noted by the original author (presumably
			 x-coordinates on the page - TODO confirm; the boundaries passed
			 below differ slightly from these notes):
			   0-40    col 1  running number
			   40-200  col 2  name of the beneficiary
			   200-400 col 2  name of the operation
			   400-500 col 3  year of approval / final payment
			   500-630 col 4  approved amount
			   630-    col 5  total amounts paid on completion of the operation
			 */
			var rows = PDFExtract.utils.extractColumnRows(lines, [42, 200, 400, 500, 560, 1200], 0.2);

			// Drop the grand-total row: it has no value in the first column
			// and 'Summe Insgesamt' in the second.
			rows = rows.filter(function (row) {
				return (row[0] || row[1] !== 'Summe Insgesamt');
			});

			rows_collect = rows_collect.concat(rows);
			next();
		}, function (err) {
			if (err) {
				console.log(err);
				return cb(err);
			}

			rows_collect = mergeMultiRows(rows_collect).filter(function (row) {
				if (isValidRow(row)) {
					return true;
				}
				console.log('ALARM, invalid row', JSON.stringify(row));
				return false;
			});

			if (debug) {
				fs.writeFileSync(debugcache + '_' + filename + '.items.json', JSON.stringify(lines_collect, null, '\t'));
				// One row per line keeps the dump diff-friendly.
				var sl = rows_collect.map(function (row) {
					return JSON.stringify(row);
				});
				fs.writeFileSync(debugcache + '_' + filename + ".rows.json", '[' + sl.join(',\n') + ']');
			}

			var cleanString = function (cell) {
				return (cell || '').trim();
			};

			// Index 0 (the running number) is intentionally not exported.
			var records = rows_collect.map(function (row) {
				return {
					_source: item,
					beneficiary: row[1] || '',
					name_of_operation: row[2] || '',
					years: row[3] || '',
					allocated_public_funding: cleanString(row[4]),
					on_finish_total_value: cleanString(row[5])
				};
			});

			fs.writeFileSync(filename + ".json", JSON.stringify(records, null, '\t'));
			cb(err);
		});
	});
};
/**
 * Extracts the table rows of one PDF and writes them as JSON.
 *
 * The PDF is read from `<basename of item>.pdf` and the result is written to
 * `<basename of item>.json`, both relative to the current working directory.
 * Page 1 is skipped. When the `debug` flag is set, intermediate data is
 * additionally dumped into the `debugcache` location.
 *
 * @param {string} item - Path of the PDF; only its base name is used to locate
 *   the file, the full value is stored in each record as `_source`.
 * @param {function(Error=)} cb - Called once when done. Receives the error if
 *   the extraction or the page iteration failed, otherwise a falsy value.
 */
var scrapePDF = function (item, cb) {
	// Builds one entry of the per-page correction tables below.
	var shift = function (offset1, offset3, offset4) {
		return { offset1: offset1, offset3: offset3, offset4: offset4 };
	};
	var NO_SHIFT = shift(0, 0, 0);

	// Hand-tuned corrections for individual pages, added to the default
	// column boundaries (offset1 -> 1st boundary, offset3 -> 3rd, offset4 -> 4th).
	// Exact pages take precedence over the ranges further down, so e.g.
	// pages 392 and 396 do NOT additionally receive the 371-400 offset1.
	var PAGE_SHIFTS = {
		24: shift(0, -5, 0),
		56: shift(0, -50, -25),
		57: shift(0, -50, -25),
		119: shift(-10, 0, 0),
		120: shift(-10, 0, 0),
		121: shift(-10, 0, 0),
		134: shift(-40, 0, 0),
		135: shift(-40, 0, 0),
		136: shift(-40, 0, 0),
		137: shift(-40, 0, 0),
		144: shift(0, -20, 0),
		175: shift(0, -30, -10),
		309: shift(0, -20, 0),
		364: shift(0, -10, 0),
		392: shift(0, -20, 0),
		396: shift(0, -40, -15),
		515: shift(0, -20, 0),
		582: shift(0, -10, 0),
		583: shift(0, -30, 0),
		584: shift(0, -35, -10),
		590: shift(0, -50, -25),
		600: shift(0, -60, -30),
		601: shift(0, -20, 0),
		602: shift(0, -20, 0),
		603: shift(0, -20, 0),
		657: shift(0, -50, -20),
		678: shift(0, -20, 0),
		707: shift(-50, 0, 0),
		721: shift(0, -40, -15),
		738: shift(-50, 0, 0),
		801: shift(0, -40, -55),
		802: shift(0, -20, -25)
	};

	// Inclusive page ranges, consulted only when no exact entry exists.
	var RANGE_SHIFTS = [
		{ from: 371, to: 400, shift: shift(-40, 0, 0) },
		{ from: 459, to: 470, shift: shift(-40, 0, 0) },
		{ from: 736, to: 737, shift: shift(-40, 0, 0) },
		{ from: 739, to: 740, shift: shift(-40, 0, 0) },
		{ from: 835, to: 836, shift: shift(-40, 0, 0) },
		{ from: 879, to: 883, shift: shift(-40, 0, 0) }
	];

	// Returns the column correction for a page number (zeros if none applies).
	var shiftForPage = function (pageNum) {
		var num = Number(pageNum);
		if (Object.prototype.hasOwnProperty.call(PAGE_SHIFTS, num)) {
			return PAGE_SHIFTS[num];
		}
		for (var i = 0; i < RANGE_SHIFTS.length; i++) {
			if (num >= RANGE_SHIFTS[i].from && num <= RANGE_SHIFTS[i].to) {
				return RANGE_SHIFTS[i].shift;
			}
		}
		return NO_SHIFT;
	};

	var filename = path.basename(item).replace('.pdf', '');
	console.log('scraping pdf', filename);
	var rows_collect = [];
	var lines_collect = [];
	var pdfExtract = new PDFExtract();

	pdfExtract.extract(filename + '.pdf', {}, function (err, data) {
		if (err) {
			// Report the failure to the caller instead of only logging it;
			// otherwise the caller would wait for `cb` forever.
			console.log(err);
			return cb(err);
		}
		if (debug) {
			fs.writeFileSync(debugcache + filename + '.pdf.json', JSON.stringify(data, null, '\t'));
		}

		async.forEachSeries(data.pages, function (page, next) {
			if (page.pageInfo.num == 1) return next();

			var alllines = PDFExtract.utils.pageToLines(page, 0.3);
			// The end marker is a string that is not expected to occur, so
			// everything after the header is taken.
			var lines = PDFExtract.utils.extractLines(alllines, ['Name des Begünstigten'], ['-------------------'/*take all*/]);
			if (lines.length == 0) {
				// Retry with the shorter header text.
				lines = PDFExtract.utils.extractLines(alllines, ['Begünstigten'], ['-------------------'/*take all*/]);
			}

			if (lines.length == 0) {
				console.log('ALARM, page', page.pageInfo.num, 'without data');
			} else if (debug) {
				lines_collect = lines_collect.concat(lines);
				fs.writeFileSync(debugcache + filename + '-' + page.pageInfo.num + '.json', JSON.stringify(lines, null, '\t'));
			}

			// console.log(PDFExtract.utils.xStats(page));
			/*
			 Column layout as noted by the original author (presumably
			 x-coordinates on the page - TODO confirm; the boundaries passed
			 below differ from these notes):
			   0-150   col 1  name of the beneficiary
			   150-500 col 2  name of the operation
			   500-600 col 3  year of approval / final payment
			   600-760 col 4  approved amount
			   760-    col 5  total amounts paid on completion of the operation
			 */
			var pageShift = shiftForPage(page.pageInfo.num);
			var columns = [
				140 + pageShift.offset1,
				500,
				670 + pageShift.offset3,
				725 + pageShift.offset4,
				1200
			];
			var rows = PDFExtract.utils.extractColumnRows(lines, columns, 0.2);

			rows_collect = rows_collect.concat(rows);
			next();
		}, function (err) {
			if (err) {
				console.log(err);
				return cb(err);
			}

			rows_collect = mergeMultiRows(rows_collect).filter(function (row) {
				if (isValidRow(row)) {
					return true;
				}
				console.log('ALARM, invalid row', JSON.stringify(row));
				return false;
			});

			if (debug) {
				fs.writeFileSync(debugcache + '_' + filename + '.items.json', JSON.stringify(lines_collect, null, '\t'));
				// One row per line keeps the dump diff-friendly.
				var sl = rows_collect.map(function (row) {
					return JSON.stringify(row);
				});
				fs.writeFileSync(debugcache + '_' + filename + ".rows.json", '[' + sl.join(',\n') + ']');
			}

			var cleanString = function (cell) {
				return (cell || '').trim();
			};

			var records = rows_collect.map(function (row) {
				return {
					_source: item,
					beneficiary: row[0] || '',
					name_of_operation: row[1] || '',
					years: row[2] || '',
					allocated_public_funding: cleanString(row[3]),
					on_finish_total_value: cleanString(row[4])
				};
			});

			fs.writeFileSync(filename + ".json", JSON.stringify(records, null, '\t'));
			cb(err);
		});
	});
};