Beispiel #1
0
    // Downloads the page at item.href as a raw buffer, decodes it with the
    // charset reported by jschardet, and turns every matching table row into a
    // subclass record that is pushed onto item.subclasses and handed to
    // scrapAtividades.
    request({ url:item.href, encoding: null }, function(error, response, html) {
        // Handle the transport error first: `response` may be undefined in that
        // case, so reading response.statusCode before this guard would throw.
        if (error) {
            console.log("error subclasses", error);
            return;
        }

        // A non-200 answer is only reported; the body is still scraped below.
        if (response.statusCode !== 200) {
            console.log("error: ", item.href);
        }

        console.log("ok: ", new Date());

        // Decode the raw buffer exactly once. Decoding again would feed an
        // already decoded string back into iconv.
        html = iconv.decode(html, jschardet.detect(html).encoding);

        var $ = cheerio.load(html);

        // The three descriptive texts are read from the first three cells
        // selected by width='95%'; query the selector once and reuse it.
        var notas = $("td[width='95%']");
        var compreende = notas.eq(0).text().trim();
        var compreendeTambem = notas.eq(1).text().trim();
        var naoCompreende = notas.eq(2).text().trim();

        $("td[width='105']").each(function() {
            var parent = $(this).parent();

            var data = parent.children().eq(1).find("a");
            var descricaoEl = parent.children().eq(2);
            var codigo = data.text();

            var subclasse = {
                classe: item,
                codigo: codigo,
                descricao: descricaoEl.text().capitalize(),
                // The search URL takes the code without its first "-" and "/".
                href: options.baseUrl + "pesquisa.asp?TabelaBusca=CNAE_202@CNAE%202.2%20-%20Subclasses@0@cnaefiscal@0&source=subclasse&pesquisa=" + codigo.replace("-","").replace("/",""),
                compreende: capitalize(compreende),
                compreendeTambem : capitalize(compreendeTambem),
                naoCompreende: capitalize(naoCompreende),
                atividades: []
            };

            item.subclasses.push(subclasse);

            scrapAtividades(subclasse);
        });
    });
Beispiel #2
0
		// Collects the whole piped stream, optionally converts it to UTF-8 text,
		// and passes { text, body, type, size } to cb.
		.pipe(es.wait(function(err, data) {
			// NOTE(review): on a stream error cb is never invoked — confirm callers expect that.
			if (err) return;
			var body;
			var type;
			var size = req.response.headers["content-length"];
			// Keep the unconverted payload so a failed conversion can fall back to it.
			var org = data;
			if (!binaryflag){
				try {
					var detectResult = jschardet.detect(data) || {};
					var iconv = new Iconv(detectResult.encoding, "UTF-8//TRANSLIT//IGNORE");
					data = iconv.convert(data).toString();
				} catch (e) {
					// Any detection/conversion failure leaves the data as received.
					data = org;
				}
			}

			// Best-effort JSON parse: a non-JSON payload yields an empty object.
			try {
				body = JSON.parse(data);
			} catch (e) {
				body = {};
			}
			// Take the content-type header up to the first ';' separator.
			try {
				type = req.response.headers["content-type"].split(/ *; */).shift();
			} catch (e) {
				// NOTE(review): the fallback is an object while the success path yields a string.
				type = {};
			}
			data = {
				text: data,
				body: body,
				type: type,
				size: size
			};
			cb(data);
		}));
Beispiel #3
0
	/**
	 * Converts a template file into a JavaScript source string.
	 *
	 * The file contents are decoded with the charset detected by jschardet.
	 * Optional `<!--filename:NAME-->` and `<!--varname:NAME-->` markers in the
	 * contents override the output file name and variable name.
	 *
	 * @param file object exposing `contents` (raw bytes) and `path`
	 * @returns {{fileName: string, varname: string, fileContent: string}}
	 */
	function trans(file){
		// Every local is declared with its own `var`. The original ended the
		// declaration list early with a stray ';', which turned the following
		// names into implicit globals.
		var output = {};
		var encode = jschardet.detect(file.contents).encoding;
		var fileBasePath = path.basename(file.path);
		var filecontentTemp = iconv.decode(file.contents, encode);

		// File name: taken from the marker, otherwise derived from the path.
		var fileNameMatch = filecontentTemp.match(/<!--filename:(\w+)-->/);
		output.fileName = fileNameMatch ? fileNameMatch[1] : getFileName(fileBasePath);

		// JS variable name: taken from the marker, otherwise the file name.
		var varNameMatch = filecontentTemp.match(/<!--varname:(\w+)-->/);
		output.varname = varNameMatch ? varNameMatch[1] : output.fileName;

		filecontentTemp = html2js(filecontentTemp);
		if(options.type=='amd' || options.type=='cmd' || options.type=='fmd'){
			// Module formats wrap the template in an anonymous define().
			output.fileContent = 'define(function () {\n  return \''+filecontentTemp+'\'\n});';
		}else{
			// Otherwise emit a plain assignment to the variable name.
			output.fileContent = output.varname+'=\'' + filecontentTemp + '\';';
		}
		return output;
	}
		/**
		 * Handles charset encoding: converts a subtitle buffer to a UTF-8 string.
		 *
		 * @param dataBuff raw subtitle bytes
		 * @param language key into App.Localization.langcodes, used to look up
		 *                 the encodings expected for that language
		 * @param callback called with the UTF-8 string
		 */
		function decode(dataBuff, language, callback) {
			var charsetDetect = require('jschardet');
			var targetEncodingCharset = 'utf8';

			var charset = charsetDetect.detect(dataBuff);
			var detectedEncoding = charset.encoding;
			win.debug("SUB charset detected: "+detectedEncoding);

			// Do we need decoding? The detector may report no encoding at all,
			// so guard before normalising the name.
			if (detectedEncoding && detectedEncoding.toLowerCase().replace('-','') == targetEncodingCharset) {
				callback(dataBuff.toString('utf-8'));
				return;
			}

			// We do
			var iconv = require('iconv-lite');
			var langInfo = App.Localization.langcodes[language] || {};
			win.debug("SUB charset expected: "+langInfo.encoding);
			if (langInfo.encoding !== undefined && langInfo.encoding.indexOf(detectedEncoding) < 0) {
				// The detected encoding was unexpected for the language, so we'll use the most common
				// encoding for that language instead.
				detectedEncoding = langInfo.encoding[0];
			}
			if (!detectedEncoding) {
				// Nothing detected and no language hint: treat the bytes as UTF-8
				// rather than handing an empty encoding name to iconv.
				detectedEncoding = targetEncodingCharset;
			}
			win.debug("SUB charset used: "+detectedEncoding);
			dataBuff = iconv.encode( iconv.decode(dataBuff, detectedEncoding), targetEncodingCharset );
			callback(dataBuff.toString('utf-8'));
		}
Beispiel #5
0
module.exports = function (contents, options) {
  options = options || {};

  if (isUtf8(contents)) {
    return contents;
  }

  var encInfo = jschardet.detect(contents);
  var encFrom = encInfo.encoding;

  switch (encInfo.encoding) {
    case 'UTF-16LE':
      encFrom = 'utf16-le';
      break;
    default:
      encFrom = 'win1256';
  }

  try {
    var decoded = iconv.decode(contents, encFrom);
    contents = iconv.encode(decoded, 'utf8');
  } catch (e) {
    console.log('Conversion Failed: ' + e);
  }

  return contents;
};
Beispiel #6
0
    charsetDetect: function( filename ){

        filename = Path.resolve( filename );
        var detResult = JsCharDet.detect( Fs.readFileSync( filename ) );

        if( detResult.confidence < 0.5 ){
            return undefined;
        }

        var charset = detResult.encoding;

        if( charset.match( /ascii/i ) ){
            return undefined;
        }

        if( charset.match( /^gb.*/i ) ){
            charset = 'gbk';
        }

        if( charset.match( /utf8|utf-8/i) ){
            charset = 'utf-8';
        }

        return charset;
    },
Beispiel #7
0
                        // Writes every .srt entry of the archive to subOutputFile, converting
                        // it to targetEncodingCharset when the detected charset differs.
                        // NOTE(review): all matching entries are written to the same
                        // subOutputFile — confirm archives hold a single .srt.
                        zipEntries.forEach(function(zipEntry, key) {
                            if (zipEntry.entryName.indexOf('.srt') == -1) {
                                return;
                            }

                            var decompressedData = zip.readFile(zipEntry); // decompressed buffer of the entry
                            var charset = charsetDetect.detect(decompressedData);
                            var sourceEncoding = charset.encoding;

                            if (sourceEncoding != targetEncodingCharset && sourceEncoding != targetCharset) {
                                var iconv = require('iconv-lite');

                                if (sourceEncoding == 'IBM855') {
                                    // The detector misreports Portuguese and Romanian
                                    // subtitles as IBM855; they are actually ISO-8859-1.
                                    sourceEncoding = 'iso-8859-1';
                                }
                                else if (sourceEncoding == 'windows-1251' || sourceEncoding == 'windows-1252') {
                                    // Also a detector misfire (Spanish, Portuguese, Romanian).
                                    // Romanian files are ISO-8859-2, the rest ISO-8859-1.
                                    sourceEncoding = subOutputFile.indexOf('romanian.srt') > 0 ? 'iso-8859-2' : 'iso-8859-1';
                                }

                                // When the detector reported no encoding at all, keep the
                                // bytes untouched instead of passing an empty name to iconv.
                                if (sourceEncoding) {
                                    decompressedData = iconv.encode( iconv.decode(decompressedData, sourceEncoding), targetEncodingCharset );
                                }
                            }

                            // fs.writeFile requires a callback; report write failures
                            // instead of dropping them.
                            fs.writeFile( subOutputFile, decompressedData, function(err) {
                                if (err) {
                                    console.error('failed to write subtitle file', subOutputFile, err);
                                }
                            });
                        });
Beispiel #8
0
/**
 * Picks the charset of an HTTP response body.
 *
 * Three candidates are gathered: the jschardet result (only when its
 * confidence is above 0.8), the charset of the Content-Type header, and the
 * charset of a <meta> tag in the body. A detected charset confirmed by the
 * header or the meta tag wins; otherwise the first available of header, meta
 * and detection is used, and 'utf-8' is the final fallback.
 *
 * @param contentType Content-Type header value (may be empty or undefined)
 * @param body        response body exposing toString('binary')
 * @returns lower-cased charset name
 */
const getCharset = function(contentType, body) {
	const header = contentType || '';

	const binary = body.toString('binary');
	const detection = jschardet.detect(binary);
	const detectedCharset = detection.confidence > 0.8
		? detection.encoding.toLowerCase()
		: undefined;

	const headerMatch = header.match(/charset=([\w\-]+)/i);
	const httpHeaderCharset = headerMatch ? headerMatch[1].toLowerCase() : undefined;

	const metaMatch = binary.match(/<meta\b[^>]*charset=["']?([\w\-]+)/i);
	const htmlMetaCharset = metaMatch ? metaMatch[1].toLowerCase() : undefined;

	// A detection backed by either declared charset is returned directly.
	const confirmed = detectedCharset === httpHeaderCharset || detectedCharset === htmlMetaCharset;
	if (detectedCharset && confirmed) {
		return detectedCharset;
	}

	// Otherwise prefer the declarations over the unconfirmed detection.
	return httpHeaderCharset || htmlMetaCharset || detectedCharset || 'utf-8';
};
Beispiel #9
0
  /**
   * 判断指定buffer对象的字符编码
   * ref: https://github.com/LeoYuan/leoyuan.github.io/issues/25
   * @param buffer
   * @param options
   *  - defaultEncoding 指定默认编码集
   *  - minConfidence   指定可接受的最小confidence,如果判断结果小于此值,则用defaultEncoding
   *  - verbose         返回更加详细的字符编码数据
   * @returns {*}
   */
  detectEncoding(buffer, options) {
    options = options || {};
    buffer = buffer || Buffer('');

    var DEFAULT_ENCODING = 'GBK', MIN_CONFIDENCE = 0.96;
    var verbose = options.verbose;
    var defaultEncoding = options.defaultEncoding || DEFAULT_ENCODING;
    var minConfidence = options.minConfidence || MIN_CONFIDENCE;
    var ret = jschardet.detect(buffer), encoding = ret.encoding === 'ascii' ? 'utf-8' : ret.encoding,
        confidence = ret.confidence;
    // var VALID_ENCODINGS = ['gb2312', 'gbk', 'utf-8', 'big5', 'euc-kr','euc-jp'];

    if (encoding === null || !iconv.encodingExists(encoding) || confidence < minConfidence) {
        return verbose ? {
            encoding: defaultEncoding,
            oriEncoding: encoding,
            confidence: confidence
        } : defaultEncoding;
    } else {
        encoding = encoding.toUpperCase();
        return verbose ? {
            encoding: encoding,
            oriEncoding: encoding,
            confidence: confidence
        } : encoding;
    }
  }
/**
 * Detects the encoding of an HTML body with jschardet.
 * @param buffer body (buffer)
 * @return the detected charset name, or undefined unless the detection
 *         confidence is at least 0.99
 */
function _detectEncodingByBuffer(buffer) {
  var detection = jschardet.detect(buffer);
  var isReliable = detection && detection.encoding && (detection.confidence || 0) >= 0.99;
  return isReliable ? detection.encoding : undefined;
}
Beispiel #11
0
exports.decodeBuffer = function (buf) {
  var MIN_CONFIDENCE = 0.96,
    DEFAULT_ENCODING = 'GB2312';

  if (!buf) {
    return false;
  }

  var iconv = require('iconv-lite');
  var jschardet = require('jschardet');

  var
    detectResult = jschardet.detect(buf),
    encoding = detectResult.encoding;

  if (detectResult.confidence < MIN_CONFIDENCE) {
    encoding = DEFAULT_ENCODING;
  }

  if (!encoding) {
    return false;
  }
  // fix ascii bug
  encoding = encoding === 'ascii' ? 'utf8' : encoding;
  return [iconv.decode(buf, encoding), encoding];
};
    // Handles charset encoding: hands the buffer to `callback` as a UTF-8
    // string, converting it first when the detected charset is not UTF-8.
    function decode(dataBuff, language, callback) {
      var charsetDetect = require('jschardet');
      var utf8Names = ['utf8', 'utf-8'];
      // Detection results treated as possible detector misfires.
      var suspectEncodings = ['IBM855', 'windows-1250', 'windows-1251', 'windows-1252', 'windows-1255', 'windows-1254'];

      var detectedEncoding = charsetDetect.detect(dataBuff).encoding;

      // Already UTF-8: no conversion needed.
      if (utf8Names.indexOf(detectedEncoding) !== -1) {
        callback(dataBuff.toString('utf-8'));
        return;
      }

      var iconv = require('iconv-lite');
      // Windows-1251/2/IBM855 works fine when read from a file (like it's UTF-8), but if you try to convert it you'll ruin the encoding.
      // Just save it again, and it'll be stored as UTF-8. At least on Windows.

      if (suspectEncodings.indexOf(detectedEncoding) !== -1) {
        // The language info is an empty object here, so no expected encoding
        // list is available and the detected encoding is kept.
        var langInfo = {};
        var expected = langInfo.encoding;
        if (expected && expected.indexOf(detectedEncoding) < 0) {
          // The detected encoding was unexpected for the language, so the most
          // common encoding for that language is used instead.
          detectedEncoding = expected[0];
        }
      }

      var utf8Buffer = iconv.encode( iconv.decode(dataBuff, detectedEncoding), 'utf8' );
      callback(utf8Buffer.toString('utf-8'));
    }
Beispiel #13
0
/**
 * Converts `text` to a UTF-8 string from the encoding jschardet detects,
 * logging the detected encoding on the way.
 */
var convert = function(text) {
  var sourceEncoding = jschardet.detect(text).encoding;
  console.log('#detected:' + sourceEncoding);
  var converter = new Iconv(sourceEncoding, 'UTF-8//TRANSLIT//IGNORE');
  return converter.convert(text).toString();
};
Beispiel #14
0
  // May be moved to the end of the file.
  /**
   * Checks the charset of a .txt file and reports encodings other than
   * UTF-8/ASCII, then registers the file in `files`.
   *
   * @param file path of the file to check
   * @param cb   called as cb(null, file) in every case
   */
  function csvCharsetChecking(file, cb) {
    var minCheckConfidenceNeeded = 0.93;

    // [ProviderName]_[Region/Consortium]_[PackageName]_[YYYY-MM-DD].txt
    // The dot is escaped so only a real ".txt" extension is accepted.
    if (!fs.existsSync(file) || !/\.txt$/.test(file)) {
      console.error(file + " : seems not to be a pkb file (no .txt extension)");
      cb(null, file);
      return;
    }

    // Read the file only after the existence check, so a missing path is
    // reported above instead of throwing from readFileSync.
    var jschardetCheck = jschardet.detect(fs.readFileSync(file));

    if (jschardetCheck.encoding) {
      if (jschardetCheck.encoding === 'windows-1252'
          && jschardetCheck.confidence > minCheckConfidenceNeeded) {
        // A confident windows-1252 detection is only a warning, shown in
        // verbose mode.
        if (argv.verbose) {
          console.error("Encoding warning : " +
          file + " : have to be UTF-8 encoded (" + jschardetCheck.encoding + " detected with " +
           jschardetCheck.confidence + " confidence). Check that title should not have accents");
        }
      } else if (! (jschardetCheck.encoding === 'utf-8'
                  || jschardetCheck.encoding === 'ascii')) {
        console.error("Encoding error : " +
        file + " : have to be UTF-8 encoded (" + jschardetCheck.encoding + " detected)");
      }
    }
    files.push(file);
    cb(null, file);
  }
      // Once both bodies are available, verify that only the first stream
      // was decoded.
      Q.all(results).done(function(bodies) {

        var firstCharset = chardet.detect(bodies[0]).encoding;
        var secondCharset = chardet.detect(bodies[1]).encoding;
        var expectedText = '全球中文网站前二十强';

        // The first stream was meant to be decoded.
        firstCharset.should.equal('ascii');
        bodies[0].indexOf(expectedText).should.not.equal(-1);

        // The second stream was not.
        secondCharset.should.equal('windows-1252');
        bodies[1].indexOf(expectedText).should.equal(-1);

        done();
      });
Beispiel #16
0
 detectByBuffer: function (buffer) {
   var enc = jschardet.detect(buffer);
   // 高精度で判定できた場合のみ
   if (enc && enc.encoding && (enc.confidence || 0) >= 0.99) {
     return enc.encoding;
   }
   return undefined;
 },
Beispiel #17
0
 // Each fixture file name starts with its expected encoding, followed by '_'.
 encFiles.forEach(function(entry) {
     var actual = jschardet.detect(entry.content).encoding.toLowerCase();
     var expected = entry.file.split('_')[0];
     if (actual !== 'iso-8859-2') {
         expect(actual).to.eql(expected);
         return;
     }
     // An iso-8859-2 detection is logged as an expected failure, not asserted.
     console.log("expected failure on '"+entry.file+"'", actual, expected);
 });
Beispiel #18
0
}).forEach(function(f){
    // Force-convert the file's encoding to UTF-8, rewriting it in place,
    // then hand the name (without '.csv') to cvs2json.
    // NOTE(review): the detected encoding is used without a null check, and the
    // comparison only matches the lower-case spelling 'utf-8' — confirm against
    // the jschardet version in use.
    var rfs = fs.readFileSync('./'+f),encoding = jschardet.detect(rfs).encoding;
    if(encoding!=='utf-8')
    {
        fs.writeFileSync('./'+f,iconv.encode(iconv.decode(rfs,encoding),'utf-8'));
    }
    cvs2json(f.replace('.csv',''));
})
Beispiel #19
0
 // After the subtitle file has been written, re-encode it in place when its
 // detected charset differs from targetCharset.
 subOutput.on('finish', function() {
     var outputPath = this.path;
     // Read raw bytes: iconv.decode expects a buffer, not a 'binary' string.
     var subData = fs.readFileSync(outputPath);
     var charset = charsetDetect.detect(subData);
     // Skip the conversion when nothing was detected; an empty encoding name
     // cannot be decoded.
     if( charset.encoding && charset.encoding != targetCharset ) {
         var iconv = require('iconv-lite');
         var converted = iconv.encode( iconv.decode(subData, charset.encoding), targetCharset );
         // fs.writeFile requires a callback; report write failures.
         fs.writeFile( outputPath, converted, function(err) {
             if (err) {
                 console.error('failed to rewrite subtitle file', outputPath, err);
             }
         });
     }
 });
Beispiel #20
0
 // Reads filePath and passes its text, decoded with the detected encoding
 // ('utf8' when none is detected), to cb(error, text).
 fs.readFile( filePath, function( error, data ) {
   if ( error ) {
     cb( error, null );
     return;
   }
   var detected = jschardet.detect( data ).encoding;
   var encodingName = detected ? detected.toLowerCase() : 'utf8';
   var text = iconv.decode( data, encodingName );
   cb( null, text );
 });
Beispiel #21
0
/**
 * Guesses whether a file is text by running charset detection on its first
 * 4096 bytes.
 *
 * @param filePath path of the file to inspect
 * @returns {boolean} true only when jschardet reports a confidence of 1
 */
common.detectIsTextFile = function(filePath) {
  var SAMPLE_SIZE = 4096;
  // Buffer.alloc returns zero-filled memory, unlike the deprecated
  // `new Buffer(size)`, which exposes uninitialised bytes.
  var buffer = Buffer.alloc(SAMPLE_SIZE);
  var fd = fs.openSync(filePath, 'r');
  var bytesRead;
  try {
    bytesRead = fs.readSync(fd, buffer, 0, SAMPLE_SIZE, 0);
  } finally {
    // Release the descriptor even if the read throws.
    fs.closeSync(fd);
  }
  // Inspect only the bytes actually read, so files shorter than the sample
  // are not judged on padding.
  var rs = jschardet.detect(buffer.subarray(0, bytesRead));
  log.debug('detectIsTextFile:', filePath, rs);
  return rs.confidence === 1;
};
Beispiel #22
0
    // Downloads the page at item.href, decodes it with the detected charset and
    // registers every row whose formatted code is 7 characters long as a
    // "classe" of `item`. A request error ends the callback without any action.
    request({ url:item.href, encoding: null }, function(error, response, html) {
        if (error) {
            return;
        }

        var decodedHtml = iconv.decode(html, jschardet.detect(html).encoding);
        var $ = cheerio.load(decodedHtml);

        $("td[width='100']").each(function() {
            var row = $(this).parent();

            var data = row.children().eq(1).find("a");
            var descricaoEl = row.children().eq(2);

            // Build the dotted code from the link text.
            var rawCodigo = data.text();
            var ccodigo = rawCodigo.substr(1, 2) + '.' + rawCodigo.substr(2,4);

            var classe = {
                grupo: item,
                codigo: ccodigo,
                descricao: descricaoEl.text().capitalize(),
                href: options.baseUrl + data.attr("href"),
                subclasses: []
            };

            // Rows whose formatted code is not 7 characters long are ignored.
            if (classe.codigo.length !== 7) {
                return;
            }

            item.classes.push(classe);
            scrapSubclasses(classe);

            // The model instance is built but not saved by this block.
            var classeCnae = new Cnae({
              _id: classe.codigo,
              Secao: item.divisao.secao.codigo,
              Divisao: item.divisao.codigo,
              Grupo: item.codigo,
              Classe: classe.codigo,
              Subclasse: 0,
              Descricao: capitalize(classe.descricao)
            });

            console.log(classe.grupo.divisao.secao.codigo + " " +
                        classe.grupo.divisao.codigo + " " +
                        classe.grupo.codigo + " " +
                        classe.codigo + " " +
                        classe.descricao);
        });
    });
Beispiel #23
0
/**
 * Converts `str` to the encoding configured in this.opt.encoding.
 *
 * @param str input to convert
 * @returns the converted string; the input itself when no encoding is
 *          configured or when the conversion fails
 */
Client.prototype.convertEncoding = function (str) {
    // No Encoding, bail
    if (!this.opt.encoding) return str;
    try {
        // Detect the input charset
        const charset = charsetDetector.detect(str);
        // Trust the detection only at full confidence, otherwise use the configured encoding
        const encodeTo = charset.confidence >= 1 ? charset.encoding : this.opt.encoding;
        // Return a string in the format specified in the configuration
        return iconv.decode(iconv.encode(str, encodeTo), this.opt.encoding);
    } catch (err) {
        if (this.opt.debug) console.log('\u001b[01;31mERROR: ' + err + '\u001b[0m');
        // Fall back to the unconverted input so callers never receive undefined.
        return str;
    }
};
Beispiel #24
0
module.exports = function decode(text, charset) {

  if(!charset) {
    var encoding = jschardet.detect(text).encoding;
    charset = encoding || defaultEncoding;
  }

  if(!iconv.encodingExists(charset) || charset === defaultEncoding) {
    return he.decode(text.toString(defaultEncoding));
  }

  return he.decode(iconv.decode(text, charset));

};
      .then((data) => {
        const encoding = {
          'ascii': 'cp866',
          'UTF-8': 'utf8',
          'windows-1251': 'win1251',
        };

        let dataEncoding = detect(data).encoding;
        if (!dataEncoding || !encoding[dataEncoding]) {
          dataEncoding = process.platform.match(/win/) ? 'win1251' : 'utf8';
        }
        const decoded = iconv.decode(data, dataEncoding);
        resolve(decoded);
      })
  }, function (err, res, body) {
    if (err)
      console.error(err);

    var encoding = jschardet.detect(body).encoding;
    if (encoding !== "ascii" && encoding !== "utf-8") {
      var iconv = new Iconv(encoding, "UTF-8//TRANSLIT//IGNORE");
      body = iconv.convert(body);
    }

    var $ = cheerio.load(body);
    var pageTitle = $("title").text();

    console.log(pageTitle);
  });
Beispiel #27
0
 // Reads filePath and passes its text, decoded with the detected encoding,
 // to cb(null, text). Read, detection and decoding failures are passed to cb
 // as the error.
 fs.readFile( filePath, function( error, data ) {
   if ( error ) {
     cb( error, null );
     return;
   }

   var text;
   try {
     var encodingName = jschardet.detect( data ).encoding.toLowerCase();
     text = iconv.decode( data, encodingName );
   } catch ( e ) {
     cb( e );
     return;
   }

   cb( null, text );
 });
Beispiel #28
0
module.exports = function encode(text, charset) {

  if(!charset) {
    var encoding = jschardet.detect(text).encoding;
    if(encoding) {
      charset = encoding;
    }
  }

  if(!iconv.encodingExists(charset) || charset === 'utf-8') {
    return he.decode(text.toString('utf-8'));
  }

  return he.decode(iconv.decode(text, charset));

}
Beispiel #29
0
/**
 * Decodes a response body to text and resolves HTML entities in it.
 *
 * The encoding comes from charset(response.headers, body), falling back to
 * jschardet detection.
 *
 * @param response response object exposing `headers`
 * @param body     raw response body
 * @returns the decoded text
 * @throws Error when response or body is missing, or no encoding is found
 */
Downloader.responseDecode = function (response, body) {
    if (!response) {
        throw new Error('Пустой response. ');
    }
    if (!body) {
        throw new Error('Пустой body. ');
    }

    var encodingName = charset(response.headers, body) || jschardet.detect(body).encoding;
    if (!encodingName) {
        throw new Error('Unknown encoding ');
    }

    return he.decode(iconv.decode(body, encodingName.toLowerCase()));
};
Beispiel #30
0
    // Downloads the page at item.href, decodes it with the detected charset and
    // registers every row whose code is 2 characters long as a "divisao" of
    // `item`. A request error ends the callback without any action.
    request({ url:item.href, encoding: null }, function(error, response, html) {
        if (error) {
            return;
        }

        var decodedHtml = iconv.decode(html, jschardet.detect(html).encoding);
        var $ = cheerio.load(decodedHtml);

        $("td[width='105']").each(function() {
            var row = $(this).parent();

            var data = row.children().eq(1).find("a");
            var descricaoEl = row.children().eq(2);

            var divisao = {
                secao: item,
                codigo: data.text(),
                descricao: descricaoEl.text().capitalize(),
                href: options.baseUrl + data.attr("href"),
                grupos: []
            };

            // Rows whose code is not 2 characters long are ignored.
            if (divisao.codigo.length !== 2) {
                return;
            }

            item.divisoes.push(divisao);
            scrapGrupos(divisao);

            // The model instance is built but not saved by this block.
            var divisaoCnae = new Cnae({
              _id: divisao.codigo,
              Secao: item.codigo,
              Divisao: divisao.codigo,
              Grupo: 0,
              Classe: 0,
              Subclasse: 0,
              Descricao: capitalize(divisao.descricao)
            });

            console.log(divisao.secao.codigo + " " +
                        divisao.codigo + " " +
                        divisao.descricao);
        });
    });