示例#1
0
/**
 * Extracts the table rows of one PDF and writes them as JSON next to the
 * current working directory (`<basename>.json`).
 *
 * Only the base name of `item` is used: the extractor is given
 * `<basename>.pdf` and the result is written to `<basename>.json`, both
 * without the directory part of `item`.
 *
 * Relies on names defined outside this function: `path`, `fs`, `async`,
 * `PDFExtract`, `debug`, `debugcache`, `mergeMultiRows` and `isValidRow`.
 *
 * @param {string} item - Path (or file name) of the PDF; stored verbatim in
 *   each output record as `_source`.
 * @param {function} cb - Node-style callback. Called with the error when
 *   extraction or page iteration fails, otherwise with `null` once the JSON
 *   file has been written.
 */
var scrapePDF = function (item, cb) {
	var filename = path.basename(item).replace('.pdf', '');
	console.log('scraping pdf', filename);
	var rows_collect = [];
	var lines_collect = [];
	var pdfExtract = new PDFExtract();
	pdfExtract.extract(filename + '.pdf', {}, function (err, data) {
		if (err) {
			// Report the failure to the caller as well; only logging it would
			// leave the caller waiting for a callback that never comes.
			console.log(err);
			return cb(err);
		}
		if (debug)
			fs.writeFileSync(debugcache + filename + '.pdf.json', JSON.stringify(data, null, '\t'));
		async.forEachSeries(data.pages, function (page, next) {
			var lines = PDFExtract.utils.pageToLines(page, 0.3);
			if (debug) {
				// Dump of all lines of the page; overwritten below by the
				// extracted lines whenever the page yields any.
				fs.writeFileSync(debugcache + filename + '-' + page.pageInfo.num + '.json', JSON.stringify(lines, null, '\t'));
			}
			lines = PDFExtract.utils.extractLines(lines, ['Restzahlung'], ['erstellt mit EurekaRLP']);
			if (lines.length == 0) {
				console.log('ALARM, page', page.pageInfo.num, 'without data');
			} else if (debug) {
				lines_collect = lines_collect.concat(lines);
				fs.writeFileSync(debugcache + filename + '-' + page.pageInfo.num + '.json', JSON.stringify(lines, null, '\t'));
			}
			/*
			 Column layout (boundaries below are presumably x coordinates on
			 the page -- confirm against PDFExtract.utils.extractColumnRows).
			 The resulting cells are consumed further down as:

			 row[0]  "lfd. Nr." (running number)
			 row[1]  "Name des Begünstigten" (beneficiary)
			 row[2]  "BEZEICHNUNG DES VORHABENS" (name of operation)
			 row[3]  "JAHR DER BEWILLIGUNG / RESTZAHLUNG" (years)
			 row[4]  "Bewilligter Betrag" (allocated amount)
			 row[5]  "BEI ABSCHLUSS DES VORHABENS GEZAHLTE GESAMTBETRÄGE" (total paid on completion)
			 */
			var rows = PDFExtract.utils.extractColumnRows(lines, [42, 200, 400, 500, 560, 1200], 0.2);
			// Drop the grand-total line: a row whose first cell is empty and
			// whose second cell is exactly 'Summe Insgesamt'.
			rows = rows.filter(function (row) {
				return (row[0] || row[1] !== 'Summe Insgesamt');
			});
			rows_collect = rows_collect.concat(rows);
			next();
		}, function (seriesErr) {
			if (seriesErr) {
				console.log(seriesErr);
				return cb(seriesErr);
			}

			// Merge rows spanning several lines, then discard (and log) any
			// row the validator rejects.
			rows_collect = mergeMultiRows(rows_collect).filter(function (row) {
				if (!isValidRow(row)) {
					console.log('ALARM, invalid row', JSON.stringify(row));
					return false;
				}
				return true;
			});

			if (debug) {
				fs.writeFileSync(debugcache + '_' + filename + '.items.json', JSON.stringify(lines_collect, null, '\t'));
				// One row per output line keeps the debug file diff-friendly.
				var sl = rows_collect.map(function (row) {
					return JSON.stringify(row);
				});
				fs.writeFileSync(debugcache + '_' + filename + ".rows.json", '[' + sl.join(',\n') + ']');
			}

			// Missing cells become '', present ones lose surrounding whitespace.
			var cleanString = function (cell) {
				return (cell || '').trim();
			};

			var final = rows_collect.map(function (row) {
				return {
					_source: item,
					beneficiary: row[1] || '',
					name_of_operation: row[2] || '',
					years: row[3] || '',
					allocated_public_funding: cleanString(row[4]),
					on_finish_total_value: cleanString(row[5])
				};
			});
			fs.writeFileSync(filename + ".json", JSON.stringify(final, null, '\t'));
			cb(null);
		});
	});
};
示例#2
0
/**
 * Extracts the table rows of one PDF and writes them as JSON next to the
 * current working directory (`<basename>.json`). Page 1 is skipped.
 *
 * Only the base name of `item` is used: the extractor is given
 * `<basename>.pdf` and the result is written to `<basename>.json`, both
 * without the directory part of `item`.
 *
 * Relies on names defined outside this function: `path`, `fs`, `async`,
 * `PDFExtract`, `debug`, `debugcache`, `mergeMultiRows` and `isValidRow`.
 *
 * @param {string} item - Path (or file name) of the PDF; stored verbatim in
 *   each output record as `_source`.
 * @param {function} cb - Node-style callback. Called with the error when
 *   extraction or page iteration fails, otherwise with `null` once the JSON
 *   file has been written.
 */
var scrapePDF = function (item, cb) {
	var filename = path.basename(item).replace('.pdf', '');
	console.log('scraping pdf', filename);
	var rows_collect = [];
	var lines_collect = [];
	var pdfExtract = new PDFExtract();

	// Hand-tuned corrections of the column boundaries for individual pages,
	// as [offset1, offset3, offset4]: offset1 shifts the first boundary (140),
	// offset3 the third (670) and offset4 the fourth (725).
	var NO_SHIFT = [0, 0, 0];
	var pageShifts = {
		24: [0, -5, 0],
		56: [0, -50, -25],
		57: [0, -50, -25],
		119: [-10, 0, 0],
		120: [-10, 0, 0],
		121: [-10, 0, 0],
		134: [-40, 0, 0],
		135: [-40, 0, 0],
		136: [-40, 0, 0],
		137: [-40, 0, 0],
		144: [0, -20, 0],
		175: [0, -30, -10],
		309: [0, -20, 0],
		364: [0, -10, 0],
		392: [0, -20, 0],
		396: [0, -40, -15],
		515: [0, -20, 0],
		582: [0, -10, 0],
		583: [0, -30, 0],
		584: [0, -35, -10],
		590: [0, -50, -25],
		600: [0, -60, -30],
		601: [0, -20, 0],
		602: [0, -20, 0],
		603: [0, -20, 0],
		657: [0, -50, -20],
		678: [0, -20, 0],
		707: [-50, 0, 0],
		721: [0, -40, -15],
		738: [-50, 0, 0],
		801: [0, -40, -55],
		802: [0, -20, -25]
	};
	// Inclusive page ranges, consulted only when no single-page entry exists.
	var rangeShifts = [
		{from: 371, to: 400, shift: [-40, 0, 0]},
		{from: 459, to: 470, shift: [-40, 0, 0]},
		{from: 736, to: 737, shift: [-40, 0, 0]},
		{from: 739, to: 740, shift: [-40, 0, 0]},
		{from: 835, to: 836, shift: [-40, 0, 0]},
		{from: 879, to: 883, shift: [-40, 0, 0]}
	];
	// Looks up the boundary shifts for a page. A single-page entry wins over
	// a range, so pages 392 and 396 keep only their own entry and do not
	// additionally receive the 371-400 shift.
	var shiftsForPage = function (num) {
		var exact = pageShifts[Number(num)];
		if (exact) return exact;
		for (var i = 0; i < rangeShifts.length; i++) {
			if ((num >= rangeShifts[i].from) && (num <= rangeShifts[i].to)) {
				return rangeShifts[i].shift;
			}
		}
		return NO_SHIFT;
	};

	pdfExtract.extract(filename + '.pdf', {}, function (err, data) {
		if (err) {
			// Report the failure to the caller as well; only logging it would
			// leave the caller waiting for a callback that never comes.
			console.log(err);
			return cb(err);
		}
		if (debug)
			fs.writeFileSync(debugcache + filename + '.pdf.json', JSON.stringify(data, null, '\t'));
		async.forEachSeries(data.pages, function (page, next) {
			if (page.pageInfo.num == 1) return next();
			var alllines = PDFExtract.utils.pageToLines(page, 0.3);
			// The end marker is a string that is presumably never present, so
			// everything after the header is taken.
			var lines = PDFExtract.utils.extractLines(alllines, ['Name des Begünstigten'], ['-------------------'/*take all*/]);
			// Retry with a shorter header marker when the full one is not found.
			if (lines.length == 0)
				lines = PDFExtract.utils.extractLines(alllines, ['Begünstigten'], ['-------------------'/*take all*/]);
			if (lines.length == 0) {
				console.log('ALARM, page', page.pageInfo.num, 'without data');
			} else if (debug) {
				lines_collect = lines_collect.concat(lines);
				fs.writeFileSync(debugcache + filename + '-' + page.pageInfo.num + '.json', JSON.stringify(lines, null, '\t'));
			}
			/*
			 Column layout (boundaries below are presumably x coordinates on
			 the page -- confirm against PDFExtract.utils.extractColumnRows).
			 The resulting cells are consumed further down as:

			 row[0]  "Name des Begünstigten" (beneficiary)
			 row[1]  "BEZEICHNUNG DES VORHABENS" (name of operation)
			 row[2]  "JAHR DER BEWILLIGUNG / RESTZAHLUNG" (years)
			 row[3]  "Bewilligter Betrag" (allocated amount)
			 row[4]  "BEI ABSCHLUSS DES VORHABENS GEZAHLTE GESAMTBETRÄGE" (total paid on completion)
			 */
			var shift = shiftsForPage(page.pageInfo.num);
			var rows = PDFExtract.utils.extractColumnRows(lines, [140 + shift[0], 500, 670 + shift[1], 725 + shift[2], 1200], 0.2);
			rows_collect = rows_collect.concat(rows);
			next();
		}, function (seriesErr) {
			if (seriesErr) {
				console.log(seriesErr);
				return cb(seriesErr);
			}

			// Merge rows spanning several lines, then discard (and log) any
			// row the validator rejects.
			rows_collect = mergeMultiRows(rows_collect).filter(function (row) {
				if (!isValidRow(row)) {
					console.log('ALARM, invalid row', JSON.stringify(row));
					return false;
				}
				return true;
			});

			if (debug) {
				fs.writeFileSync(debugcache + '_' + filename + '.items.json', JSON.stringify(lines_collect, null, '\t'));
				// One row per output line keeps the debug file diff-friendly.
				var sl = rows_collect.map(function (row) {
					return JSON.stringify(row);
				});
				fs.writeFileSync(debugcache + '_' + filename + ".rows.json", '[' + sl.join(',\n') + ']');
			}

			// Missing cells become '', present ones lose surrounding whitespace.
			var cleanString = function (cell) {
				return (cell || '').trim();
			};

			var final = rows_collect.map(function (row) {
				return {
					_source: item,
					beneficiary: row[0] || '',
					name_of_operation: row[1] || '',
					years: row[2] || '',
					allocated_public_funding: cleanString(row[3]),
					on_finish_total_value: cleanString(row[4])
				};
			});
			fs.writeFileSync(filename + ".json", JSON.stringify(final, null, '\t'));
			cb(null);
		});
	});
};