Beispiel #1
0
    /**
     * Checks every page of a PDF and reports, through `callback`, whether all
     * of them were judged grayscale.
     *
     * The page count is read from the pdf2json parse result; each page (1-based)
     * is then handed to `determineGreyscaleForPage`, which reports back through
     * a shared completion function. Progress is tracked on the instance
     * (`isGrayscale`, `pagesProcessed`, `pagesToProcess`).
     *
     * @param {string} pathToPdf - Path of the PDF file to inspect.
     * @param {function(boolean)} callback - Invoked once with the overall verdict.
     * @throws Whatever pdf2json reports via `pdfParser_dataError` is re-thrown
     *         from inside the parser's event handler.
     */
    PdfColorInspector.prototype.canApplyGrayscale = function (pathToPdf, callback) {
        var PFParser = require("pdf2json");
        var pdfParser = new PFParser();
        var that = this;

        // Per-page completion: one non-grayscale page flips the overall verdict;
        // the callback fires once the last page has reported.
        var conclude = function (isGrayscale) {
            if (!isGrayscale)
                that.isGrayscale = false;
            that.pagesProcessed++;

            if (that.pagesProcessed === that.pagesToProcess) {
                callback(that.isGrayscale);
            }
        };

        pdfParser.on("pdfParser_dataReady", function (data) {
            // Reset the per-run state so a second call on the same instance does
            // not inherit counters (or a negative verdict) from an earlier run.
            that.isGrayscale = true;
            that.pagesProcessed = 0;
            that.pagesToProcess = data.PDFJS.pages.length;

            // With no pages `conclude` would never run, leaving the caller
            // waiting forever; answer immediately instead.
            if (that.pagesToProcess === 0) {
                callback(that.isGrayscale);
                return;
            }

            for (var i = 1; i <= that.pagesToProcess; i++) {
                that.determineGreyscaleForPage(pathToPdf, i, conclude);
            }
        });
        pdfParser.on("pdfParser_dataError", function (error) {
            // NOTE(review): the callback has no error channel, so the failure is
            // thrown from the event handler — callers cannot catch it.
            throw error;
        });
        pdfParser.loadPDF(pathToPdf);
    };
Beispiel #2
0
module.exports = function(path, cb) {
  var parser = new Parser()
  parser.on('pdfParser_dataReady', function(result) {

    var text = []

    //get text on a particular page
    result.data.Pages.forEach(function(page) {
      var chunks = _(page.Texts).map('R').flatten().map('T').map(decodeURIComponent).value()
      text = text.concat(chunks)
    })

    parser.destroy()

    setImmediate(function() {
      cb(null, text)
    })
  })

  parser.on('pdfParser_dataError', function(err) {
    parser.destroy()
    cb(err)
  })
  if(path instanceof Buffer) {
    return parser.parsePDFBuffer(path)
  }
  parser.loadPDF(path)
}
Beispiel #3
0
    // After the uploaded file has been inserted, start parsing it and redirect
    // to the root page straight away; the parse outcome is delivered later to
    // the two bound handlers.
    insertFile(req.file, function(){
      var parser = new PDFParser();

      var readyHandler = _.bind(_onPDFBinDataReady, this);
      parser.on('pdfParser_dataReady', readyHandler);

      var errorHandler = _.bind(_onPDFBinDataError, this);
      parser.on('pdfParser_dataError', errorHandler);

      parser.loadPDF(req.file.path);
      res.redirect('/');
    });
 // When the source stream opens: copy the upload into the temp directory, then
 // parse the staged copy and store the extracted text in the requested
 // collection. Continues the middleware chain via next() in every outcome
 // except an exception caught by the try/catch below.
 readstream.on('open', function() {
     var store = readstream._store;
     var filename = store.filename;
     var newPath = config.tempdir + '/' + req.files.file.id + '_' + filename;
     // Stage file
     var writestream = fs.createWriteStream(newPath);
     var pdfParser = new PFParser();

     // Parse only after the staged copy is completely written; starting right
     // after pipe() would read a file that is still being filled.
     writestream.on('finish', function() {
         pdfParser.loadPDF(newPath);
     });
     // A failed staging write never emits 'finish'; report it instead of
     // leaving the request hanging.
     writestream.on('error', function(err) {
         return next(err);
     });

     readstream.pipe(writestream);
     if (config.debug)
         console.log('File stored in a new location:' + newPath);

     pdfParser.on("pdfParser_dataReady", function(evtData) {
         try {
             var textData = evtData.data;
             db.collection(req.params.collection, function(err, collection) {
                 // Without this guard a lookup failure would dereference an
                 // undefined collection below.
                 if (err)
                     return next(err);
                 // Extract Text portion and save it in a separate
                 // node
                 var data = jsonpath(textData, "$..T");
                 var aggregateData = '';
                 for (var i = 0; i < data.length; i++) {
                     aggregateData += data[i];
                 }
                 aggregateData = querystring.unescape(aggregateData);
                 textData.text = aggregateData;
                 textData.uploadDate = new Date();
                 collection.insert(Array.isArray(textData) ? textData[0] : textData, function(err, docs) {
                     if (err)
                         return next(err);
                     res.locals.items = textData;
                     res.locals.docs = docs;
                     event.emit("i", req, res);
                     return next();
                 });
             });
             evtData.destroy();
             evtData = null;
         } catch (err) {
             // Ignore promise being closed
             if (config.debug)
                 console.log(err);
         }
     });
     pdfParser.on("pdfParser_dataError", function(evtData) {
         // Log before releasing the payload so the message shows the actual
         // error rather than "null".
         if (config.debug) {
             console.log('Error occured when converting PDF:' + evtData);
         }
         // Not every error payload is guaranteed to expose destroy().
         if (evtData && typeof evtData.destroy === 'function')
             evtData.destroy();
         return next();
     });
 });
Beispiel #5
0
    /**
     * Parses `<file>.pdf` with the shared `pdfParser` and writes two files to
     * the current directory: the extracted text lines (`urldecoded.txt`) and
     * the output of `enLongParser` (`<file>.json`).
     *
     * @param {string} file - Base name of the PDF, without the `.pdf` extension.
     */
    function parse(file) {

        // NOTE(review): test override — the argument is currently ignored and
        // 'enk2014' is always parsed. Remove once callers pass real names.
        file = 'enk2014';

        const urlDecodedFile = 'urldecoded.txt';

        // fs.writeFile needs a completion callback; without one, write failures
        // go unnoticed (and current Node versions reject the call outright).
        const reportWriteError = err => {
            if (err) {
                console.error(err);
            }
        };

        pdfParser.on("pdfParser_dataReady", pdfData => {
            // URI-decode the whole parse result in one pass via a JSON round trip.
            const json = JSON.parse(decodeURI(JSON.stringify(pdfData)));
            const textData = parseHelpers.getTextItems(json.formImage.Pages);

            fs.writeFile("./" + urlDecodedFile, textData.join('\n'), reportWriteError);
            fs.writeFile("./" + file + '.json', enLongParser(textData), reportWriteError);
        });

        pdfParser.loadPDF(file + '.pdf')
    }
Beispiel #6
0
/**
 * Starts parsing a PDF and returns an EventEmitter that reports the outcome.
 *
 * Emitted events:
 *  - 'success' with the parser payload when it is truthy and has truthy `data`;
 *  - 'error' with an empty Error when the payload is missing or has no `data`;
 *  - 'error' with the payload's `data` when the parser reports a failure.
 *
 * @param {string} fileName - Path of the PDF to load.
 * @returns {EventEmitter} Emitter carrying the 'success' / 'error' events.
 */
function pdfParserPromise(fileName) {
	console.log('Reading', fileName);

	var outcome = new events.EventEmitter();
	var parser = new PDFParser();

	parser.on("pdfParser_dataReady", function (payload) {
		// Guard clause: a payload without data counts as a failure.
		if (!payload || !payload.data) {
			outcome.emit('error', new Error());
			return;
		}
		outcome.emit('success', payload);
	});

	parser.on("pdfParser_dataError", function (failure) {
		outcome.emit('error', failure.data);
	});

	parser.loadPDF(fileName);

	return outcome;
};
// Parses a fixed PDF and dumps the parse result as JSON to testPDFrip.json.
let fs = require('fs'),
    PDFParser = require("pdf2json");

let pdfParser = new PDFParser();

pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
pdfParser.on("pdfParser_dataReady", pdfData => {
    // fs.writeFile needs a completion callback; report a failed write instead
    // of losing it.
    fs.writeFile("testPDFrip.json", JSON.stringify(pdfData), err => {
        if (err) {
            console.error(err);
        }
    });
});

pdfParser.loadPDF("HCC-consent-app-form.PDF");
Beispiel #8
0
    obtenerDatosPDF(archivo, done, extra) {
        let pdfParser = new PDFParser()
        let resultado = new ResultadoParserPDF(archivo, extra)

        pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError))

        pdfParser.on("pdfParser_dataReady", pdfData => {
            // Son comprobantes siempre tienen una sola pagina
            let textos = pdfData.formImage.Pages[0].Texts

            const beVerbose = process.env.VERBOSE

            if (beVerbose) {
                logger.debug({ message: "=DEBUG= Full pdf text parse" })
                logger.debug({ message: "=DEBUG=" + JSON.stringify(textos) })
            }

            // Agrupar como lineas textos con la misma coordenada "y"
            let agrupados = {}
            for (let i = 0; i < textos.length; i++) {
                let coordenadaY = Math.floor(textos[i].y)
                if (agrupados[coordenadaY.toString()] === undefined) {
                    agrupados[coordenadaY.toString()] = { texto: "" }
                }

                agrupados[coordenadaY.toString()].texto += decodeURIComponent(textos[i].R[0].T) + " "
            }

            for (let key in agrupados) {
                let texto = agrupados[key].texto.trim()

                if (beVerbose) {
                    logger.debug({ message: "=DEBUG= Texto=" + texto })
                    logger.debug({ message: "=DEBUG= Linea Y=" + key })
                    logger.debug({ message: "=DEBUG= Es Tipo = " + resultado.esTipo(texto) })
                    logger.debug({ message: "=DEBUG= Es Cliente = " + resultado.esCliente(texto) })
                    logger.debug({ message: "=DEBUG= Es Importe = " + resultado.esImporte(texto) })
                    logger.debug({ message: "=DEBUG= Resultado=" + JSON.stringify(resultado) })
                }

                if (resultado.esTipo(texto)) {
                    resultado.cargarTipo(texto)
                }

                if (resultado.esCliente(texto)) {
                    // CLIENTE
                    resultado.cargarCliente(texto)
                }

                if (resultado.esImporte(texto)) {
                    // IMPORTE
                    resultado.cargarImporte(texto)
                }

                if (resultado.completo()) {
                    break;
                }
            };

            if (!resultado.completo()) {
                logger.debug({ message: "=ERROR= No se completo el resultado => " + JSON.stringify(resultado) })
            }

            done(resultado);
        });

        pdfParser.loadPDF(archivo)
    }
// Prints the text content of a résumé: PDFs go through pdf2json's raw-text
// mode, anything else through textract.
// NOTE(review): both input paths are hard-coded absolute paths.
if (file === "pdf") {
    let PDFParser = require("pdf2json");

    let pdfParser = new PDFParser(this, 1);

    pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));
    pdfParser.on("pdfParser_dataReady", pdfData => {
        console.log(pdfParser.getRawTextContent());
    });

    pdfParser.loadPDF("/Users/nehakapoor/Documents/GitHub/jobshout-server-nodejs/jobshout_server/NehaResume.pdf");
} else {
    var textract = require('textract');
    textract.fromFileWithPath("/Users/nehakapoor/Documents/GitHub/jobshout-server-nodejs/jobshout_server/NehaResume.docx", function (error, text) {
        // Report extraction failures instead of silently printing `undefined`.
        if (error) {
            console.error(error);
            return;
        }
        console.log(text);
    });
}
Beispiel #10
0
 return new Promise((resolve, reject) => {
   let pdfParser = new PDFParser();
   pdfParser.on("pdfParser_dataError", errData => {
     console.log("PDF Parse Error:");
     if (errData.parserError) {
       console.log(errData.parserError);
       reject(errData.parserError);
     }
     else {
       console.log(errData);
       reject(errData);
     }
   });
   pdfParser.on("pdfParser_dataReady", pdfData => {
     var page = pdfData.formImage.Pages[0];
     var texts = page.Texts;
     // Here's the mess we're parsing:
     // console.log(JSON.stringify(texts, null, ' '));
     //
     // We start parsing after the word "Friday",
     // then parse each day, concatenating each line.
     // Concatenate strings that are on the same y coordinate.
     // Add a newline between strings that are on different y coordinates.
     // URL-decode all strings.
     var days = {};
     var menu = {
       school: decodeURIComponent(texts[0].R[0].T).trim(),
       month: decodeURIComponent(texts[1].R[0].T).trim(),
       days: days
     }
     var day = {};
     var date = '';
     var item = {};
     var state = 'AWAIT_FRIDAY';
     var prevY = 0;
     texts.forEach(text => {
       var line = text.R[0].T;
       var y = text.y;
       if (state == 'AWAIT_FRIDAY') {
         if (line == 'Friday') {
           state = 'PROCESSING_DAYS';
         }
       }
       else if (state == 'PROCESSING_DAYS') {
         if (isNumeric(line)) {
           // New day
           date = line;
           day = { date: date, food: [], events: [] };
           days[line] = day;
           item = null;
         }
         else {
           var decodedLine = decodeURIComponent(line);
           var isEvent = isUpperCase(decodedLine);
           var isNewItem = (prevY != y || !item);
           if (isNewItem) {
             item = { text: '' };
             if (isEvent) {
               day.events.push(item);
             }
             else {
               day.food.push(item);
             }
           }
           item.text += decodedLine;
           prevY = y;
         }
       }
     });
     resolve(menu);
   });
   pdfParser.loadPDF(pdfFilePath);
 });