Example #1
0
module.exports = function () {
  var currentCompletion;
  var completionWords = { rid: [], mime: [], rtype: [] };

  var rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    completer: function (line) {
      var i = line.lastIndexOf(' ');
      var wordToComplete;
      if (i === -1) { wordToComplete = line; }
      else          { wordToComplete = line.substr(i + 1); }

      var matches = (completionWords[currentCompletion] || []).filter(function (word) {
        return (word.substr(0, wordToComplete.length) === wordToComplete);
      });

      return [matches, wordToComplete];
    }
  });

  rl.setPrompt('> ', 2);

  rl.write("\nThis tool will help you to initialize a platform.\n");
  rl.write("It will create the basic structure for you.\n");
  rl.write("Type \"exit\" anytime to cancel and leave.\n\n");

  rl.on('line', function(line) {
    if (line == 'exit') {
      rl.write('Cancelled\n');
      process.exit(0);
    }
  });

  var stack = new Stackware();
  var pkbDomains;
  var rootDir;

  // Setup autocompletion
  stack.use(function (manifest, next) {
    var toLoad = ['rid', 'mime', 'rtype'];

    (function loadNext() {
      var type = toLoad.pop();
      if (!type) { return next(); }

      fs.readFile(path.join(platformsDir, type + '.json'), function (err, content) {
        if (err) { return loadNext(); }

        var json;
        try {
          json = JSON.parse(content);
        } catch (e) {
          json = [];
        }
        json.forEach(function (element) {
          completionWords[type].push(element.code);
        });

        loadNext();
      });
    })();
  });

  stack.use(function (manifest, next) {
    rl.write("What's the short name of the platform ?\n");

    (function prompt() {
      rl.prompt();
      rl.once('line', function (shortName) {
        rootDir = path.join(__dirname, '../../platforms/', shortName);

        fs.exists(rootDir, function (exist) {
          if (exist) {
            rl.write("this platform already exists\n");
            return prompt();
          }
          manifest.name = shortName;
          next();
        });
      });
    })();
  });

  stack.use(function (manifest, next) {
    rl.write("What's the long name of the platform ?\n");
    rl.prompt();
    rl.once('line', function (longName) {
      manifest.longname = longName;
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write("How would you describe this parser ?\n");
    rl.prompt();
    rl.once('line', function (description) {
      manifest.describe = description;
      next();
    });
    rl.write('Identifie les consultations de la plateforme ' + manifest.longname);
  });

  stack.use(function (manifest, next) {
    rl.write("What's the URL of the analyzis ?\n");
    rl.prompt();
    rl.once('line', function (url) {
      manifest.docurl = url;
      next();
    });
    rl.write('http://analogist.couperin.org/platforms/' + manifest.name + '/');
  });

  stack.use(function (manifest, next) {
    rl.write("Who's the contact for this parser ?\n");
    rl.prompt();
    rl.once('line', function (contact) {
      manifest.contact = contact;
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write("Does it have a knowledge base ? (y/N)\n");

    rl.prompt();
    rl.once('line', function (havePKB) {
      manifest.pkb = (havePKB.trim().toLowerCase() === 'y');
      next();
    });
  });

  stack.use(function (manifest, next) {
    if (!manifest.pkb) { return next(); }

    rl.write("If domains are taken from the knowledge base, which column contains them ?\n");
    rl.write("(empty answer to skip)\n");
    rl.prompt();
    rl.once('line', function (field) {
      field = field.trim();
      if (field) {
        manifest['pkb-domains'] = field;
        pkbDomains = true;
      }
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write('What domains should be handled ?');
    if (pkbDomains) { rl.write(' (in addition to those found in the knowledge base)'); }
    rl.write('\n(empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (domain) {
        domain = domain.trim();
        if (!domain) { return next(); }

        manifest.domains.push(domain);
        prompt();
      });
    })();
  });

  stack.use(function (manifest, next) {
    currentCompletion = 'rid';
    rl.write('What identifiers should be recognized (ex: title_id, doi) ?');
    rl.write('\n(tab for autocompletion, empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (rid) {
        rid = rid.trim();
        if (!rid) {
          currentCompletion = '';
          return next();
        }

        manifest.recognize.rid.push(rid);
        prompt();
      });
    })();
  });

  stack.use(function (manifest, next) {
    currentCompletion = 'mime';
    rl.write('What mime types should be recognized (ex: HTML, PDF) ?');
    rl.write('\n(tab for autocompletion, empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (mime) {
        mime = mime.trim();
        if (!mime) {
          currentCompletion = '';
          return next();
        }

        manifest.recognize.mime.push(mime);
        prompt();
      });
    })();
  });

  stack.use(function (manifest, next) {
    currentCompletion = 'rtype';
    rl.write('What kind of consultations should be recognized (ex: ARTICLE, BOOK) ?');
    rl.write('\n(tab for autocompletion, empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (rtype) {
        rtype = rtype.trim();
        if (!rtype) {
          currentCompletion = '';
          return next();
        }

        manifest.recognize.rtype.push(rtype);
        prompt();
      });
    })();
  });

  // Create directories
  stack.use(function (manifest, next) {
    var directories = [
      path.join(rootDir, 'scrapers'),
      path.join(rootDir, 'pkb'),
      path.join(rootDir, 'test'),
      rootDir
    ];

    (function createDir() {
      var dir = directories.pop();
      if (!dir) { return next(); }

      fs.mkdir(dir, function (err) {
        if (err) {
          next(new Error('could not create ' + dir));
        } else {
          createDir();
        }
      });
    })();
  });

  // Create the manifest
  stack.use(function (manifest, next) {
    var manifestPath = path.join(rootDir, 'manifest.json');

    fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2), function (err) {
      if (err) {
        next(new Error('could not create ' + manifestPath));
      } else {
        next();
      }
    });
  });

  // Create the parser skeleton
  stack.use(function (manifest, next) {
    var parserPath = path.join(rootDir, 'parser.js');


    fs.readFile(skeletonPath, function (err, skeleton) {
      if (err) {
        return next(new Error('could not read ' + skeletonPath));
      }
      skeleton = skeleton.toString().replace('[description-goes-here]', manifest.describe);

      fs.writeFile(parserPath, skeleton, { mode: parseInt('775', 8) }, function (err) {
        if (err) {
          next(new Error('could not create ' + parserPath));
        } else {
          next();
        }
      });
    });
  });

  // Create an empty test file
  stack.use(function (manifest, next) {
    var filename     = manifest.name + '.' + manifest.version + '.csv';
    var testFilePath = path.join(rootDir, 'test', filename);
    var firstLine    = '';

    manifest.recognize.rid.forEach(function (rid) {
      firstLine += 'out-' + rid + ';';
    });
    if (manifest.recognize.rtype.length) { firstLine += 'out-rtype;'; }
    if (manifest.recognize.mime.length)  { firstLine += 'out-mime;'; }
    firstLine += 'in-url';

    fs.writeFile(testFilePath, firstLine, function (err) {
      if (err) {
        next(new Error('could not create ' + testFilePath));
      } else {
        next();
      }
    });
  });

  stack.use(function (err, manifest, next) {
    console.error(err);
    process.exit(1);
  });

  stack.use(function (manifest, next) {
    rl.write('Skeleton created in ' + rootDir + '\n');
    process.exit(0);
  });

  stack.process({
    version: moment().format('YYYY-MM-DD'),
    status: "beta",
    domains: [],
    recognize: {
      rid: [],
      rtype: [],
      mime: []
    }
  });
};
Example #2
0
 // Register each middleware of the job on the stack, in array order
 job.middlewares.forEach((middleware) => { stack.use(middleware); });
module.exports = function () {
  var currentCompletion;
  var possibleFields = {};

  var rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    completer: function (line) {
      var i = line.lastIndexOf(' ');
      var wordToComplete;
      if (i === -1) { wordToComplete = line; }
      else          { wordToComplete = line.substr(i + 1); }

      var matches = (possibleFields[currentCompletion] || []).filter(function (field) {
        return field.startsWith(wordToComplete);
      });

      return [matches, wordToComplete];
    }
  });

  rl.setPrompt('> ', 2);

  rl.write('\nThis tool will help you to initialize a platform.\n');
  rl.write('It will create the basic structure for you.\n');
  rl.write('Type "exit" anytime to cancel and leave.\n\n');

  rl.on('line', function(line) {
    if (line == 'exit') {
      rl.write('Cancelled\n');
      process.exit(0);
    }
  });

  var stack = new Stackware();
  var pkbDomains;
  var rootDir;

  // Setup autocompletion
  stack.use(function (manifest, next) {
    fs.readFile(path.join(platformsDir, 'fields.json'), function (err, content) {
      if (err) { return next(); }

      try {
        possibleFields = JSON.parse(content);
        for (let p in possibleFields) {
          possibleFields[p] = possibleFields[p].map(field => field.code);
        }
      } catch (e) {
        console.error('[err: cannot set autocompletion]');
      }

      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write('What\'s the short name of the platform?\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (shortName) {
        rootDir = path.join(__dirname, '../../platforms/', shortName);

        fs.exists(rootDir, function (exist) {
          if (exist) {
            rl.write('This platform already exists!\n');
            return prompt();
          }
          manifest.name = shortName;
          next();
        });
      });
    })();
  });

  stack.use(function (manifest, next) {
    rl.write('What\'s the long name of the platform?\n');
    rl.prompt();
    rl.once('line', function (longName) {
      manifest.longname = longName;
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write('How would you describe this parser?\n');
    rl.prompt();
    rl.once('line', function (description) {
      manifest.describe = description;
      next();
    });
    rl.write('Recognizes the accesses to the platform ' + manifest.longname);
  });

  stack.use(function (manifest, next) {
    rl.write('What\'s the URL of the analyzis?\n');
    rl.prompt();
    rl.once('line', function (url) {
      manifest.docurl = url;
      next();
    });
    rl.write('http://ang.couperin.org/platforms/to_be_completed/');
  });

  stack.use(function (manifest, next) {
    rl.write('Who\'s the contact for this parser?\n');
    rl.prompt();
    rl.once('line', function (contact) {
      manifest.contact = contact;
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write('Does it have a knowledge base? (y/N)\n');

    rl.prompt();
    rl.once('line', function (havePKB) {
      manifest.pkb = (havePKB.trim().toLowerCase() === 'y');
      next();
    });
  });

  stack.use(function (manifest, next) {
    if (!manifest.pkb) { return next(); }

    rl.write('If domains are taken from the knowledge base, which column contains them?\n');
    rl.write('(empty answer to skip)\n');
    rl.prompt();
    rl.once('line', function (field) {
      field = field.trim();
      if (field) {
        manifest['pkb-domains'] = field;
        pkbDomains = true;
      }
      next();
    });
  });

  stack.use(function (manifest, next) {
    rl.write('What fully qualified domain names should be handled?');
    rl.write('eg. www.example.org, test.example.org, etc.)');
    if (pkbDomains) { rl.write(' (in addition to those found in the knowledge base)'); }
    rl.write('\n(empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (domain) {
        domain = domain.trim();
        if (!domain) { return next(); }

        manifest.domains.push(domain);
        prompt();
      });
    })();
  });

  stack.use(function (manifest, next) {
    currentCompletion = 'rid';
    rl.write('What identifiers should be recognized (eg: title_id, doi)?');
    rl.write('\n(tab for autocompletion, empty answer to stop)\n');

    (function prompt() {
      rl.prompt();
      rl.once('line', function (rid) {
        rid = rid.trim();
        if (!rid) {
          currentCompletion = '';
          return next();
        }

        manifest.recognize.rid.push(rid);
        prompt();
      });
    })();
  });

  // Create directories
  stack.use(function (manifest, next) {
    var directories = [
      path.join(rootDir, 'scrapers'),
      path.join(rootDir, 'pkb'),
      path.join(rootDir, 'test'),
      rootDir
    ];

    (function createDir() {
      var dir = directories.pop();
      if (!dir) { return next(); }

      fs.mkdir(dir, function (err) {
        if (err) { return next(new Error('could not create ' + dir)); }
        createDir();
      });
    })();
  });

  // Create the manifest
  stack.use(function (manifest, next) {
    var manifestPath = path.join(rootDir, 'manifest.json');

    fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2), function (err) {
      if (err) { return next(new Error('could not create ' + manifestPath)); }
      next();
    });
  });

  // Create the parser skeleton
  stack.use(function (manifest, next) {
    var parserPath = path.join(rootDir, 'parser.js');


    fs.readFile(skeletonPath, function (err, skeleton) {
      if (err) {
        return next(new Error('could not read ' + skeletonPath));
      }
      skeleton = skeleton.toString().replace('[description-goes-here]', manifest.describe);

      fs.writeFile(parserPath, skeleton, { mode: parseInt('775', 8) }, function (err) {
        if (err) { return next(new Error('could not create ' + parserPath)); }
        next();
      });
    });
  });

  // Create an empty test file
  stack.use(function (manifest, next) {
    var filename     = manifest.name + '.' + manifest.version + '.csv';
    var testFilePath = path.join(rootDir, 'test', filename);
    var firstLine    = '';

    manifest.recognize.rid.forEach(function (rid) {
      firstLine += 'out-' + rid + ';';
    });
    firstLine += 'out-rtype;';
    firstLine += 'out-mime;';
    firstLine += 'in-url';

    fs.writeFile(testFilePath, firstLine, function (err) {
      if (err) { return next(new Error('could not create ' + testFilePath)); }
      next();
    });
  });

  stack.use(function (err, manifest, next) {
    throw err;
  });

  stack.use(function (manifest, next) {
    rl.write('Skeleton created in ' + rootDir + '\n');
    process.exit(0);
  });

  stack.process({
    version: moment().format('YYYY-MM-DD'),
    status: 'beta',
    domains: [],
    recognize: {
      rid: []
    }
  });
};
Example #4
0
/**
 * Turn raw log lines into ECs and run them through the middlewares of a job.
 *
 * Lines are given one by one with push(); each successfully parsed line becomes
 * an EC that goes through a Stackware stack made of job.middlewares, followed
 * by a final handler and an error handler defined here. ECs that reach the end
 * of the stack are handed to an Organizer, whose 'ec' events are re-emitted by
 * this object. 'end' is emitted when the organizer emits 'drain'.
 *
 * Events emitted by this object: 'ec', 'denied', 'end'.
 * NOTE(review): self.emit is not defined in this block; presumably the
 * prototype inherits from an EventEmitter elsewhere in the file — confirm.
 *
 * @param {Object} job  job providing logParser, middlewares, logStreams, report,
 *                      notifiers, logger, counterReporter and outputFields.
 *                      If falsy, 'end' is emitted immediately and nothing is set up.
 */
var LinesProcessor = function (job) {
  var self      = this;
  var ecNumber  = 0;    // number of ECs sent into the stack, also used as line number
  var firstLine = true; // true until the first call to push()

  if (!job) { return self.emit('end'); }

  var logParser   = job.logParser;

  var ecOrganizer = new Organizer();
  var stack       = new Stackware();

  job.middlewares.forEach(function (mw) {
    stack.use(mw);
  });

  // Final handler: reached by ECs that went through every middleware.
  // A falsy ec (see drain()) is ignored.
  stack.use(function done(ec) {
    if (!ec) { return; }
    
    if (ec._meta.enhancementFailed) {
      job.logStreams.write('pkb-miss-ecs', ec._meta.originalLine + '\n');
      job.report.inc('rejets', 'nb-lines-pkb-miss-ecs');
    }

    // Denied ECs are emitted apart and their line number is skipped in the organizer
    if (ec._meta.granted === false) {
      self.emit('denied', ec);
      ecOrganizer.skip(ec._meta.lineNumber);
    } else {
      ecOrganizer.push(ec);
    }
  });
  
  // Error handler: the line number is skipped in the organizer, then the
  // rejection is reported according to err.type. Unknown types are only skipped.
  // NOTE(review): drain() sends null through the stack; unlike done(), this
  // handler dereferences ec._meta without a null check — confirm that no
  // middleware can report an error for the null marker.
  stack.use(function onError(err, ec, next) {
    ecOrganizer.skip(ec._meta.lineNumber);

    switch (err.type) {
      case 'ECLEAN':
        // Emitted directly, without going through the organizer
        self.emit('ec', ec);
        break;
      case 'ENOPARSER':
        job.notifiers['unknown-domains'].increment(ec.domain);
        job.logStreams.write('unknown-domains', ec._meta.originalLine + '\n');
        job.report.inc('rejets', 'nb-lines-unknown-domains');
        break;
      case 'ECHRONO':
        job.logger.verbose('A log line is not chronological : ' + ec._meta.originalLine);
        job.report.inc('rejets', 'nb-lines-unordered-ecs');
        job.logStreams.write('unordered-ecs', ec._meta.originalLine + '\n');
        break;
      case 'EDUPLICATE':
        job.report.inc('rejets', 'nb-lines-duplicate-ecs');
        job.logStreams.write('duplicate-ecs', ec._meta.originalLine + '\n');
        break;
      case 'EIRRELEVANT':
        job.report.inc('rejets', 'nb-lines-ignored');
        job.logStreams.write('filtered-ecs', ec._meta.originalLine + '\n');
        break;
      case 'EIGNOREDDOMAIN':
        job.report.inc('rejets', 'nb-lines-ignored-domains');
        job.logStreams.write('ignored-domains', ec._meta.originalLine + '\n');
        break;
      case 'EIGNOREDHOST':
        job.report.inc('rejets', 'nb-lines-ignored-hosts');
        job.logStreams.write('ignored-hosts', ec._meta.originalLine + '\n');
        break;
      case 'EROBOT':
        job.report.inc('rejets', 'nb-lines-robots-ecs');
        job.logStreams.write('robots-ecs', ec._meta.originalLine + '\n');
        break;
      case 'ENOTQUALIFIED':
        // Not reported when the EC is flagged as denied
        if (!ec._meta.denied) {
          job.report.inc('rejets', 'nb-lines-unqualified-ecs');
          job.logStreams.write('unqualified-ecs', ec._meta.originalLine + '\n');
        }
        break;
    }
  });

  // ECs released by the organizer are counted, re-emitted and added to the stats
  ecOrganizer.on('ec', function (ec) {
    job.counterReporter.count(ec);
    self.emit('ec', ec);
    // count masters values for reporting
    // (the number of distinct platforms is incremented the first time one is seen)
    if (!job.report.get('stats', 'platform-' + ec.platform)) {
      job.report.inc('stats', 'platforms');
    }
    job.report.inc('stats', 'platform-' + ec.platform);
    if (ec.rtype) { job.report.inc('stats', 'rtype-' + ec.rtype); }
    if (ec.mime)  { job.report.inc('stats', 'mime-' + ec.mime); }
  });

  ecOrganizer.on('drain', function () {
    self.emit('end');
  });

  /**
   * Parse a line and push the resulting EC into the enhancement process (if valid)
   *
   * The very first line received is stored in the report as given (before the
   * trailing carriage return is removed). Lines are ignored once
   * job.badBeginning is set. Empty lines are counted as input but not parsed.
   * A line that cannot be parsed is written to the 'unknown-formats' log
   * stream; if no line has been parsed yet, the job is flagged with
   * badBeginning and job._stop() is called.
   * @param  {String} line
   */
  self.push = function processLine(line) {
    if (firstLine) {
      firstLine = false;
      job.report.set('general', 'input-first-line', line);
    }
    if (job.badBeginning) {
      return;
    }
    job.report.inc('general', 'nb-lines-input');

    // Remove a trailing carriage return, if any
    line = line.replace(/\r$/, '');
    if (!line) { return; }

    var ec = logParser.parse(line);

    if (!ec) {
      job.logStreams.write('unknown-formats', line + '\n');
      job.report.inc('rejets', 'nb-lines-unknown-formats');
      if (!job.parsedLines) {
        job.badBeginning = true;
        job._stop();
        job.logger.warn('Couldn\'t recognize first line : aborted.', {line: line});
      }
      return;
    }

    // First successfully parsed line: record the detected format in the report
    if (!job.parsedLines) {
      job.report.set('general', 'input-format-proxy',
        job.logParser.getProxy() || 'none, auto-recognition failed');
      job.report.set('general', 'input-format-literal',
        logParser.getFormat() || 'none, auto-recognition failed');
      job.report.set('general', 'input-format-regex',
        logParser.getRegexp(true) || 'none, bad format given or auto-recognition failed');
      // Add or remove user fields from those extracted by the log parser
      // We can't do it before because we need to process one line to autodetect the format
      job.outputFields.added = job.outputFields.added.concat(job.logParser.getFields());
    }

    job.parsedLines = true;

    ec._meta.originalLine = line;
    ec._meta.lineNumber   = ++ecNumber;

    stack.process(ec);
  };

  /**
   * Signal that no more lines will be pushed: the organizer is told the number
   * of the last EC, then null is sent through the stack.
   * NOTE(review): null presumably acts as an end-of-input marker for the
   * middlewares — confirm against their implementation.
   */
  self.drain = function () {
    ecOrganizer.setLast(ecNumber);
    stack.process(null);
  };
};