// Fetches the page at item.href and scrapes one subclass record per matching
// table row, appending each to item.subclasses and handing it to scrapAtividades.
request({ url: item.href, encoding: null }, function(error, response, html) {
    // `response` is only inspected after `error` has been ruled out; reading
    // response.statusCode first would throw when the request itself failed.
    if (error) {
        console.log("error subclasses", error);
        return;
    }

    if (response.statusCode != 200) {
        console.log("error: ", item.href);
    }
    console.log("ok: ", new Date());

    // The body is decoded exactly once, using the charset jschardet reports.
    html = iconv.decode(html, jschardet.detect(html).encoding);
    var $ = cheerio.load(html);

    // The first three td[width='95%'] cells hold the explanatory notes that
    // are copied onto every subclass found on this page.
    var notes = $("td[width='95%']");
    var compreende = notes.eq(0).text().trim();
    var compreendeTambem = notes.eq(1).text().trim();
    var naoCompreende = notes.eq(2).text().trim();

    $("td[width='105']").each(function() {
        var parent = $(this).parent();
        var data = parent.children().eq(1).find("a");
        var descricaoEl = parent.children().eq(2);
        var codigo = data.text();

        var subclasse = {
            classe: item,
            codigo: codigo,
            descricao: descricaoEl.text().capitalize(),
            // The search URL takes the code without its first "-" and "/".
            href: options.baseUrl + "pesquisa.asp?TabelaBusca=CNAE_202@CNAE%202.2%20-%20Subclasses@0@cnaefiscal@0&source=subclasse&pesquisa=" + codigo.replace("-","").replace("/",""),
            compreende: capitalize(compreende),
            compreendeTambem : capitalize(compreendeTambem),
            naoCompreende: capitalize(naoCompreende),
            atividades: []
        };

        item.subclasses.push(subclasse);
        scrapAtividades(subclasse);
    });
});
.pipe(es.wait(function(err, data) { if (err) return; var body; var type; var size = req.response.headers["content-length"]; var org = data; if (!binaryflag){ try { var detectResult = jschardet.detect(data) || {}; var iconv = new Iconv(detectResult.encoding, "UTF-8//TRANSLIT//IGNORE"); data = iconv.convert(data).toString(); } catch (e) { data = org; } } try { body = JSON.parse(data); } catch (e) { body = {}; } try { type = req.response.headers["content-type"].split(/ *; */).shift(); } catch (e) { type = {}; } data = { text: data, body: body, type: type, size: size }; cb(data); }));
/**
 * Turns an HTML template file into a JavaScript source string.
 *
 * The file contents are decoded with the charset reported by jschardet, then
 * passed through html2js. Two optional markers in the template control naming:
 *   <!--filename:NAME-->  sets output.fileName (default: getFileName(basename))
 *   <!--varname:NAME-->   sets output.varname  (default: output.fileName)
 *
 * @param {Object} file - object exposing `contents` (raw bytes) and `path`.
 * @returns {{fileName: string, varname: string, fileContent: string}}
 *   fileContent is a `define(...)` wrapper when options.type is 'amd', 'cmd'
 *   or 'fmd', otherwise a `varname='...';` assignment.
 */
function trans(file) {
    var output = {};
    // Every local is declared explicitly so nothing leaks into the global
    // scope (and nothing throws under strict mode).
    var encode = jschardet.detect(file.contents).encoding || 'utf8';
    var fileBasePath = path.basename(file.path);
    var filecontentTemp = iconv.decode(file.contents, encode);

    // File name
    var fileName = filecontentTemp.match(/<!--filename:(\w+)-->/);
    output.fileName = fileName ? fileName[1] : getFileName(fileBasePath);

    // JS variable name
    var varName = filecontentTemp.match(/<!--varname:(\w+)-->/);
    output.varname = varName ? varName[1] : output.fileName;

    filecontentTemp = html2js(filecontentTemp);

    if (options.type == 'amd' || options.type == 'cmd' || options.type == 'fmd') {
        output.fileContent = 'define(function () {\n return \'' + filecontentTemp + '\'\n});';
    } else {
        output.fileContent = output.varname + '=\'' + filecontentTemp + '\';';
    }
    return output;
}
// Handles charset encoding function decode(dataBuff, language, callback) { var charsetDetect = require('jschardet'); var targetEncodingCharset = 'utf8'; var charset = charsetDetect.detect(dataBuff); var detectedEncoding = charset.encoding; win.debug("SUB charset detected: "+detectedEncoding); // Do we need decoding? if (detectedEncoding.toLowerCase().replace('-','') == targetEncodingCharset) { callback(dataBuff.toString('utf-8')); // We do } else { var iconv = require('iconv-lite'); var langInfo = App.Localization.langcodes[language] || {}; win.debug("SUB charset expected: "+langInfo.encoding); if (langInfo.encoding !== undefined && langInfo.encoding.indexOf(detectedEncoding) < 0) { // The detected encoding was unexepected to the language, so we'll use the most common // encoding for that language instead. detectedEncoding = langInfo.encoding[0]; } win.debug("SUB charset used: "+detectedEncoding); dataBuff = iconv.encode( iconv.decode(dataBuff, detectedEncoding), targetEncodingCharset ); callback(dataBuff.toString('utf-8')); } }
module.exports = function (contents, options) { options = options || {}; if (isUtf8(contents)) { return contents; } var encInfo = jschardet.detect(contents); var encFrom = encInfo.encoding; switch (encInfo.encoding) { case 'UTF-16LE': encFrom = 'utf16-le'; break; default: encFrom = 'win1256'; } try { var decoded = iconv.decode(contents, encFrom); contents = iconv.encode(decoded, 'utf8'); } catch (e) { console.log('Conversion Failed: ' + e); } return contents; };
charsetDetect: function( filename ){ filename = Path.resolve( filename ); var detResult = JsCharDet.detect( Fs.readFileSync( filename ) ); if( detResult.confidence < 0.5 ){ return undefined; } var charset = detResult.encoding; if( charset.match( /ascii/i ) ){ return undefined; } if( charset.match( /^gb.*/i ) ){ charset = 'gbk'; } if( charset.match( /utf8|utf-8/i) ){ charset = 'utf-8'; } return charset; },
// For every .srt entry in the archive: detect its charset, convert it to
// targetEncodingCharset when needed, and write the result to subOutputFile.
zipEntries.forEach(function(zipEntry, key) {
    if (zipEntry.entryName.indexOf('.srt') == -1) {
        return;
    }

    var decompressedData = zip.readFile(zipEntry); // decompressed buffer of the entry
    var sourceEncoding = charsetDetect.detect(decompressedData).encoding;

    // Conversion is skipped when the data is already in the target charset,
    // or when the detector produced no guess at all (bytes are kept as-is).
    var needsConversion = !!sourceEncoding &&
        sourceEncoding != targetEncodingCharset &&
        sourceEncoding != targetCharset;

    if (needsConversion) {
        var iconv = require('iconv-lite');

        if (sourceEncoding == 'IBM855') {
            // Known detector misfire on Portuguese/Romanian text: the data is
            // actually ISO-8859-1.
            sourceEncoding = 'iso-8859-1';
        } else if (sourceEncoding == 'windows-1251' || sourceEncoding == 'windows-1252') {
            // Same misdetection for Spanish, Portuguese and Romanian;
            // Romanian files are ISO-8859-2, the others ISO-8859-1.
            sourceEncoding = subOutputFile.indexOf('romanian.srt') > 0 ? 'iso-8859-2' : 'iso-8859-1';
        }

        decompressedData = iconv.encode(iconv.decode(decompressedData, sourceEncoding), targetEncodingCharset);
    }

    // fs.writeFile needs a callback; without one current Node versions throw
    // and write failures would go unnoticed.
    fs.writeFile(subOutputFile, decompressedData, function(err) {
        if (err) {
            console.error('Failed to write ' + subOutputFile, err);
        }
    });
});
/**
 * Picks the charset to use for an HTTP response body.
 *
 * Three candidates are gathered (all lower-cased): the jschardet result when
 * its confidence exceeds 0.8, the `charset=` value of the Content-Type header,
 * and the `charset=` value of the first matching <meta> tag in the body.
 * A detected charset confirmed by the header or the meta tag wins; otherwise
 * the order is header, meta, detected, and finally 'utf-8'.
 *
 * @param contentType - Content-Type header value (may be empty/undefined).
 * @param body - response body; must support toString('binary').
 * @returns {string} the chosen charset name.
 */
const getCharset = function(contentType, body) {
  const header = contentType || '';
  const binary = body.toString('binary');

  const detection = jschardet.detect(binary);
  const sniffed = detection.confidence > 0.8 ? detection.encoding.toLowerCase() : undefined;

  const headerMatch = header.match(/charset=([\w\-]+)/i);
  const fromHeader = headerMatch ? headerMatch[1].toLowerCase() : undefined;

  const metaMatch = binary.match(/<meta\b[^>]*charset=["']?([\w\-]+)/i);
  const fromMeta = metaMatch ? metaMatch[1].toLowerCase() : undefined;

  if (sniffed) {
    if (sniffed === fromHeader) {
      return fromHeader;
    }
    if (sniffed === fromMeta) {
      return fromMeta;
    }
  }

  return fromHeader || fromMeta || sniffed || 'utf-8';
};
/** * 判断指定buffer对象的字符编码 * ref: https://github.com/LeoYuan/leoyuan.github.io/issues/25 * @param buffer * @param options * - defaultEncoding 指定默认编码集 * - minConfidence 指定可接受的最小confidence,如果判断结果小于此值,则用defaultEncoding * - verbose 返回更加详细的字符编码数据 * @returns {*} */ detectEncoding(buffer, options) { options = options || {}; buffer = buffer || Buffer(''); var DEFAULT_ENCODING = 'GBK', MIN_CONFIDENCE = 0.96; var verbose = options.verbose; var defaultEncoding = options.defaultEncoding || DEFAULT_ENCODING; var minConfidence = options.minConfidence || MIN_CONFIDENCE; var ret = jschardet.detect(buffer), encoding = ret.encoding === 'ascii' ? 'utf-8' : ret.encoding, confidence = ret.confidence; // var VALID_ENCODINGS = ['gb2312', 'gbk', 'utf-8', 'big5', 'euc-kr','euc-jp']; if (encoding === null || !iconv.encodingExists(encoding) || confidence < minConfidence) { return verbose ? { encoding: defaultEncoding, oriEncoding: encoding, confidence: confidence } : defaultEncoding; } else { encoding = encoding.toUpperCase(); return verbose ? { encoding: encoding, oriEncoding: encoding, confidence: confidence } : encoding; } }
/**
 * Detects the encoding of an HTML body with jschardet.
 * A result is only trusted when it names an encoding and its confidence is
 * at least 0.99.
 * @param buffer body (buffer)
 * @return the detected charset name, or undefined when not trusted
 */
function _detectEncodingByBuffer(buffer) {
  var detection = jschardet.detect(buffer);
  if (!detection || !detection.encoding) {
    return undefined;
  }
  var confidence = detection.confidence || 0;
  return confidence >= 0.99 ? detection.encoding : undefined;
}
exports.decodeBuffer = function (buf) { var MIN_CONFIDENCE = 0.96, DEFAULT_ENCODING = 'GB2312'; if (!buf) { return false; } var iconv = require('iconv-lite'); var jschardet = require('jschardet'); var detectResult = jschardet.detect(buf), encoding = detectResult.encoding; if (detectResult.confidence < MIN_CONFIDENCE) { encoding = DEFAULT_ENCODING; } if (!encoding) { return false; } // fix ascii bug encoding = encoding === 'ascii' ? 'utf8' : encoding; return [iconv.decode(buf, encoding), encoding]; };
// Handles charset encoding function decode(dataBuff, language, callback) { var charsetDetect = require('jschardet'); var targetCharset = 'utf-8'; var targetEncodingCharset = 'utf8'; var charset = charsetDetect.detect(dataBuff); var detectedEncoding = charset.encoding; // Do we need decoding? if (detectedEncoding == targetEncodingCharset || detectedEncoding == targetCharset) { callback(dataBuff.toString('utf-8')); // We do } else { var iconv = require('iconv-lite'); // Windows-1251/2/IBM855 works fine when read from a file (like it's UTF-8), but if you try to convert it you'll ruin the encoding. // Just save it again, and it'll be stored as UTF-8. At least on Windows. if ( detectedEncoding == 'IBM855' || detectedEncoding == 'windows-1250' || detectedEncoding == 'windows-1251' || detectedEncoding == 'windows-1252' || detectedEncoding == 'windows-1255' || detectedEncoding == 'windows-1254' ) { // It's the charset detector f*****g up again var langInfo = {}; var expected = langInfo.encoding; if (expected && expected.indexOf(detectedEncoding) < 0) { // The detected encoding was unexepected to the language, so we'll use the most common // encoding for that language instead. detectedEncoding = expected[0]; } } dataBuff = iconv.encode( iconv.decode(dataBuff, detectedEncoding), targetEncodingCharset ); callback(dataBuff.toString('utf-8')); } }
/**
 * Converts `text` to a UTF-8 string, using the charset jschardet detects as
 * the source encoding. The detected name is logged before converting.
 *
 * @param text - input to convert.
 * @returns {string} the converted text.
 */
var convert = function(text) {
    var sourceEncoding = jschardet.detect(text).encoding;
    console.log('#detected:' + sourceEncoding);

    var converter = new Iconv(sourceEncoding, 'UTF-8//TRANSLIT//IGNORE');
    return converter.convert(text).toString();
};
// can be moved to the end of the file
/**
 * Checks the charset of a .txt package file and reports encoding problems on
 * stderr. Files that exist and end in ".txt" are appended to `files`.
 *
 * @param {string} file - path of the file to check.
 * @param {function} cb - always called as cb(null, file).
 */
function csvCharsetChecking(file, cb) {
  var minCheckConfidenceNeeded = 0.93;

  // The existence/extension guard runs before the file is read; reading
  // first would throw on a missing file and make the guard pointless.
  if (!fs.existsSync(file) || !file.match(/\.txt$/)) {
    console.error(file + " : seems not to be a pkb file (no .txt extension)");
    cb(null, file);
    return;
  }

  // [ProviderName]_[Region/Consortium]_[PackageName]_[YYYY-MM-DD].txt
  var jschardetCheck = jschardet.detect(fs.readFileSync(file));
  if (jschardetCheck.encoding) {
    // Detector names are compared case-insensitively ('UTF-8' vs 'utf-8').
    var detectedName = jschardetCheck.encoding.toLowerCase();
    if (detectedName === 'windows-1252' && jschardetCheck.confidence > minCheckConfidenceNeeded) {
      if (argv.verbose) {
        console.error("Encoding warning : " + file + " : have to be UTF-8 encoded (" + jschardetCheck.encoding + " detected with " + jschardetCheck.confidence + " confidence). Check that title should not have accents");
      }
    } else if (!(detectedName === 'utf-8' || detectedName === 'ascii')) {
      console.error("Encoding error : " + file + " : have to be UTF-8 encoded (" + jschardetCheck.encoding + " detected)");
    }
  }

  files.push(file);
  cb(null, file);
}
// Once both bodies have arrived, check which of them was decoded.
Q.all(results).done(function(bodies) {
    var needle = '全球中文网站前二十强';
    var firstCharset = chardet.detect(bodies[0]).encoding;
    var secondCharset = chardet.detect(bodies[1]).encoding;

    // The first stream was meant to be decoded, so the text is findable.
    firstCharset.should.equal('ascii');
    bodies[0].indexOf(needle).should.not.equal(-1);

    // The second stream was left alone, so the text is not findable.
    secondCharset.should.equal('windows-1252');
    bodies[1].indexOf(needle).should.equal(-1);

    done();
});
detectByBuffer: function (buffer) { var enc = jschardet.detect(buffer); // 高精度で判定できた場合のみ if (enc && enc.encoding && (enc.confidence || 0) >= 0.99) { return enc.encoding; } return undefined; },
// Each fixture is named "<encoding>_...": the prefix is the charset the
// detector is expected to report for the fixture's content.
encFiles.forEach(function(file) {
    var actual = jschardet.detect(file.content).encoding.toLowerCase();
    var expected = file.file.split('_')[0];

    if (actual !== 'iso-8859-2') {
        expect(actual).to.eql(expected);
        return;
    }

    // An iso-8859-2 detection is logged as an expected failure, not asserted.
    console.log("expected failure on '"+file.file+"'", actual, expected);
});
}).forEach(function(f){ //强行转换编码为utf8 var rfs = fs.readFileSync('./'+f),encoding = jschardet.detect(rfs).encoding; if(encoding!=='utf-8') { fs.writeFileSync('./'+f,iconv.encode(iconv.decode(rfs,encoding),'utf-8')); } cvs2json(f.replace('.csv','')); })
// When the subtitle file has been fully written, rewrite it in targetCharset
// if its detected charset differs.
subOutput.on('finish', function() {
    var filePath = this.path;
    // Read raw bytes: iconv.decode works on Buffers, and a 'binary' string
    // would have to be converted back to bytes first.
    var subData = fs.readFileSync(filePath);
    var charset = charsetDetect.detect(subData);

    // Nothing to do when detection failed or the file is already in the
    // target charset.
    if (!charset.encoding || charset.encoding == targetCharset) {
        return;
    }

    var iconv = require('iconv-lite');
    var converted = iconv.encode(iconv.decode(subData, charset.encoding), targetCharset);

    // fs.writeFile needs a callback; without one current Node versions throw
    // and write failures would go unnoticed.
    fs.writeFile(filePath, converted, function(err) {
        if (err) {
            console.error('Failed to write ' + filePath, err);
        }
    });
});
// Reads filePath, decodes it with the charset jschardet detects (falling back
// to utf8 when nothing is detected) and reports the text through cb.
fs.readFile( filePath, function( error, data ) {
    var encoding, decoded;

    if ( error ) {
        cb( error, null );
        return;
    }

    // Detection/decoding failures are routed to cb instead of being thrown
    // out of this asynchronous callback where no caller could catch them.
    try {
        encoding = jschardet.detect( data ).encoding;
        encoding = encoding ? encoding.toLowerCase() : 'utf8';
        decoded = iconv.decode( data, encoding );
    } catch ( e ) {
        cb( e, null );
        return;
    }

    // Called outside the try block so an exception raised by cb itself does
    // not trigger a second cb call.
    cb( null, decoded );
});
/**
 * Reports whether a file looks like text, judged from its first 4096 bytes.
 *
 * @param filePath - path of the file to sample.
 * @returns {boolean} true only when jschardet reports a confidence of 1.
 */
common.detectIsTextFile = function(filePath) {
    var SAMPLE_SIZE = 4096;
    // Buffer.alloc gives a zero-filled buffer; the deprecated `new Buffer(n)`
    // form could expose uninitialised memory to the detector.
    var buffer = Buffer.alloc(SAMPLE_SIZE);

    var fd = fs.openSync(filePath, 'r');
    try {
        fs.readSync(fd, buffer, 0, SAMPLE_SIZE, 0);
    } finally {
        // Always release the descriptor, even when the read throws.
        fs.closeSync(fd);
    }

    var rs = jschardet.detect(buffer);
    log.debug('detectIsTextFile:', filePath, rs);
    return rs.confidence == 1;
};
// Fetches the page at item.href and scrapes one class record per matching
// table row, appending valid ones to item.classes and descending into their
// subclasses.
request({ url: item.href, encoding: null }, function(error, response, html) {
    // Failures are reported instead of being silently dropped.
    if (error) {
        console.log("error classes", error);
        return;
    }

    html = iconv.decode(html, jschardet.detect(html).encoding);
    var $ = cheerio.load(html);

    $("td[width='100']").each(function() {
        var parent = $(this).parent();
        var data = parent.children().eq(1).find("a");
        var descricaoEl = parent.children().eq(2);

        var ccodigo = data.text();
        ccodigo = ccodigo.substr(1, 2) + '.' + ccodigo.substr(2,4);

        var classe = {
            grupo: item,
            codigo: ccodigo,
            descricao: descricaoEl.text().capitalize(),
            href: options.baseUrl + data.attr("href"),
            subclasses: []
        };

        // Only 7-character codes are kept.
        if (classe.codigo.length != 7) {
            return;
        }

        item.classes.push(classe);
        scrapSubclasses(classe);

        // The record is built but not persisted by this block.
        // NOTE(review): confirm whether constructing Cnae is still needed
        // while nothing saves it.
        var classeCnae = new Cnae({
            _id: classe.codigo,
            Secao: item.divisao.secao.codigo,
            Divisao: item.divisao.codigo,
            Grupo: item.codigo,
            Classe: classe.codigo,
            Subclasse: 0,
            Descricao: capitalize(classe.descricao)
        });

        console.log(classe.grupo.divisao.secao.codigo + " " + classe.grupo.divisao.codigo + " " + classe.grupo.codigo + " " + classe.codigo + " " + classe.descricao);
    });
});
/**
 * Converts a string to the encoding configured in this.opt.encoding.
 *
 * @param str - input to convert.
 * @returns the converted string; the input is returned unchanged when no
 *   encoding is configured or when conversion fails.
 */
Client.prototype.convertEncoding = function (str) {
    // No Encoding, bail
    if (!this.opt.encoding) return str;

    try {
        // Detect the input charset
        const charset = charsetDetector.detect(str);
        // Only a detection with full confidence is trusted; otherwise the
        // configured encoding is used.
        const encodeTo = charset.confidence >= 1 ? charset.encoding : this.opt.encoding;
        // Return a string in the format specified in the configuration
        return iconv.decode(iconv.encode(str, encodeTo), this.opt.encoding);
    } catch (err) {
        if (this.opt.debug) console.log('\u001b[01;31mERROR: ' + err + '\u001b[0m');
        // Hand back the original text rather than undefined, so a failed
        // conversion does not lose the message.
        return str;
    }
};
module.exports = function decode(text, charset) { if(!charset) { var encoding = jschardet.detect(text).encoding; charset = encoding || defaultEncoding; } if(!iconv.encodingExists(charset) || charset === defaultEncoding) { return he.decode(text.toString(defaultEncoding)); } return he.decode(iconv.decode(text, charset)); };
.then((data) => { const encoding = { 'ascii': 'cp866', 'UTF-8': 'utf8', 'windows-1251': 'win1251', }; let dataEncoding = detect(data).encoding; if (!dataEncoding || !encoding[dataEncoding]) { dataEncoding = process.platform.match(/win/) ? 'win1251' : 'utf8'; } const decoded = iconv.decode(data, dataEncoding); resolve(decoded); })
}, function (err, res, body) { if (err) console.error(err); var encoding = jschardet.detect(body).encoding; if (encoding !== "ascii" && encoding !== "utf-8") { var iconv = new Iconv(encoding, "UTF-8//TRANSLIT//IGNORE"); body = iconv.convert(body); } var $ = cheerio.load(body); var pageTitle = $("title").text(); console.log(pageTitle); });
// Reads filePath, decodes it with the charset jschardet detects and reports
// the text through cb. Read errors are passed as cb(error, null); detection
// or decoding errors as cb(e).
fs.readFile( filePath, function( readError, contents ) {
    if ( readError ) {
        cb( readError, null );
        return;
    }

    var text;
    try {
        var charsetName = jschardet.detect( contents ).encoding.toLowerCase();
        text = iconv.decode( contents, charsetName );
    } catch ( decodeError ) {
        cb( decodeError );
        return;
    }

    cb( null, text );
});
module.exports = function encode(text, charset) { if(!charset) { var encoding = jschardet.detect(text).encoding; if(encoding) { charset = encoding; } } if(!iconv.encodingExists(charset) || charset === 'utf-8') { return he.decode(text.toString('utf-8')); } return he.decode(iconv.decode(text, charset)); }
/**
 * Decodes a response body to a string and resolves HTML entities in it.
 *
 * The encoding comes from charset(response.headers, body) when that yields a
 * value, otherwise from jschardet.
 *
 * @param response - response object exposing `headers`.
 * @param body - raw response body.
 * @returns the entity-decoded text.
 * @throws {Error} when response or body is missing, or no encoding is found.
 */
Downloader.responseDecode = function (response, body) {
    if (!response) {
        throw new Error('Пустой response. ');
    }
    if (!body) {
        throw new Error('Пустой body. ');
    }

    var enc = charset(response.headers, body) || jschardet.detect(body).encoding;
    if (!enc) {
        throw new Error('Unknown encoding ');
    }

    return he.decode(iconv.decode(body, enc.toLowerCase()));
}
// Fetches the page at item.href and scrapes one division record per matching
// table row, appending valid ones to item.divisoes and descending into their
// groups.
request({ url: item.href, encoding: null }, function(error, response, html) {
    // Failures are reported instead of being silently dropped.
    if (error) {
        console.log("error divisoes", error);
        return;
    }

    html = iconv.decode(html, jschardet.detect(html).encoding);
    var $ = cheerio.load(html);

    $("td[width='105']").each(function() {
        var parent = $(this).parent();
        var data = parent.children().eq(1).find("a");
        var descricaoEl = parent.children().eq(2);

        var divisao = {
            secao: item,
            codigo: data.text(),
            descricao: descricaoEl.text().capitalize(),
            href: options.baseUrl + data.attr("href"),
            grupos: []
        };

        // Only 2-character codes are kept.
        if (divisao.codigo.length != 2) {
            return;
        }

        item.divisoes.push(divisao);
        scrapGrupos(divisao);

        // The record is built but not persisted by this block.
        // NOTE(review): confirm whether constructing Cnae is still needed
        // while nothing saves it.
        var divisaoCnae = new Cnae({
            _id: divisao.codigo,
            Secao: item.codigo,
            Divisao: divisao.codigo,
            Grupo: 0,
            Classe: 0,
            Subclasse: 0,
            Descricao: capitalize(divisao.descricao)
        });

        console.log(divisao.secao.codigo + " " + divisao.codigo + " " + divisao.descricao);
    });
});