/**
 * Breaks a chunk of HTML text into a list of lower-cased words.
 *
 * Line breaks (html <br> variants and CR/LF) are treated as spaces, punctuation
 * that is not inside a word is replaced by a space, and the result is split on
 * separator characters and a limited set of control/format characters.
 *
 * @param {string} textHTML The text to split. Must be a string.
 * @param {string} [letters] Characters the user has declared to be letters. They are
 *   inserted verbatim into a regex character class, so the caller is responsible for
 *   any escaping that class syntax requires.
 * @returns {string[]} The words, in order. An empty input yields whatever
 *   XRegExp.split returns for an empty string.
 */
LibSynphony.prototype.getWordsFromHtmlString = function (textHTML, letters) {

    // replace html break with space
    var regex = /<br><\/br>|<br>|<br \/>|<br\/>|\r?\n/g;
    var s = textHTML.replace(regex, ' ').toLowerCase();

    var punct = '\\p{P}';
    if (letters) {
        // BL-1216 Use negative look-ahead to keep letters from being counted as punctuation
        // even if Unicode says something is a punctuation character when the user
        // has specified it as a letter (like single quote).
        punct = '(?![' + letters + '])' + punct;
    }

    /**************************************************************************
     * Replace punctuation in a sentence with a space.
     *
     * Preserves punctuation marks within a word (ex. hyphen, or an apostrophe
     * in a contraction)
     **************************************************************************/
    regex = XRegExp(
        '(^' + punct + '+)'                                         // punctuation at the beginning of a string
        + '|(' + punct + '+[\\s\\p{Z}\\p{C}]+' + punct + '+)'       // punctuation within a sentence, between 2 words (word" "word)
        + '|([\\s\\p{Z}\\p{C}]+' + punct + '+)'                     // punctuation within a sentence, before a word
        + '|(' + punct + '+[\\s\\p{Z}\\p{C}]+)'                     // punctuation within a sentence, after a word
        + '|(' + punct + '+$)',                                     // punctuation at the end of a string
        'g');
    s = XRegExp.replace(s, regex, ' ');

    // Split into words using Separator characters and only SOME of the "Other" (\p{C})
    // characters. Splitting on all of \p{C} is too broad: ZERO WIDTH JOINER and
    // ZERO WIDTH NON-JOINER are format characters (\p{Cf}) that occur inside words,
    // and splitting on them cuts those words apart
    // (see https://issues.bloomlibrary.org/youtrack/issue/BL-7081).
    // ZERO WIDTH SPACE must still split words
    // (see http://issues.bloomlibrary.org/youtrack/issue/BL-3933).
    var splitters =
        '\\p{Z}'                // separators (spaces, line/paragraph separators)
        + '\\p{Cc}'             // "real" control characters
        + '\\u200B'             // ZERO WIDTH SPACE
        + '\\u200E\\u200F'      // LEFT-TO-RIGHT MARK / RIGHT-TO-LEFT MARK
        + '\\u202A-\\u202E'     // directional embedding/override markers
        + '\\u2066-\\u2069';    // directional "isolate" markers
    regex = XRegExp('[' + splitters + ']+', 'xg');
    return XRegExp.split(s.trim(), regex);
};
/**
 * Applies a list of token substitutions to a query string, one after another.
 *
 * Each token supplies a `from` pattern and a `to` replacement. Tokens flagged
 * `named` are applied through XRegExp.replace; all others go through the
 * native String.prototype.replace.
 *
 * @param {Array<{from: *, to: *, named: *}>} tokens Substitutions, applied in order.
 * @param {string} query The starting text.
 * @returns {string} The text after every substitution has been applied.
 */
module.exports.replaceToken = function(tokens, query) {
  return tokens.reduce(function(text, token) {
    return token.named
      ? XRegExp.replace(text, token.from, token.to)
      : text.replace(token.from, token.to);
  }, query);
};
uuid: function(seed) { // Create SHA hash from seed. var shaObj = new jsSHA("SHA-1", "TEXT"); shaObj.update(seed); var hashStr = shaObj.getHash("HEX").substring(0, 32); // Build a uuid based on the md5 var search = XRegExp('^(?<first>.{8})(?<second>.{4})(?<third>.{1})(?<fourth>.{3})(?<fifth>.{1})(?<sixth>.{3})(?<seventh>.{12}$)'); var replace = XRegExp('${first}-${second}-3${fourth}-a${sixth}-${seventh}'); // Replace regexp by corresponding mask, and remove / character at each side of the result. var uuid = XRegExp.replace(hashStr, search, replace).replace(/\//g, ''); return uuid; },
/**
 * Wraps every occurrence of the given words in a span carrying the given class
 * (and optional extra attributes), without touching text inside html tags.
 *
 * @param {string} storyHTML The html to search.
 * @param {string[]} aWords Words to wrap. If undefined or empty, storyHTML is returned unchanged.
 * @param {string} cssClass Value for the span's class attribute.
 * @param {string} [extra] Additional attribute text for the span. A leading space is
 *   added when missing. A missing value is treated as an empty string.
 * @returns {string} The html with matching words wrapped.
 */
LibSynphony.prototype.wrap_words_extra = function(
    storyHTML,
    aWords,
    cssClass,
    extra
) {
    if (aWords === undefined || aWords.length === 0) return storyHTML;

    if (storyHTML.trim().length === 0) return storyHTML;

    // tolerate callers that omit the optional attribute text
    if (extra === undefined || extra === null) extra = "";

    // make sure extra starts with a space
    if (extra.length > 0 && extra.substring(0, 1) !== " ") extra = " " + extra;

    // The non-breaking-space alternatives must be the literal entity text. The regex is
    // compiled with the "x" (free-spacing) flag, which ignores literal whitespace in the
    // pattern; a whitespace character here would leave an empty alternative that matches
    // everywhere and so defeats the word-start check.
    var beforeWord = "(^|>|[\\s\\p{Z}]|\\p{P}|&nbsp;)"; // word beginning delimiter
    var afterWord =
        "(?=($|<|[\\s\\p{Z}]|\\p{P}+\\s|\\p{P}+<br|[\\s]*&nbsp;|\\p{P}+&nbsp;|\\p{P}+$))"; // word ending delimiter

    // escape special characters
    // (RegExp.quote is not a standard function; presumably defined elsewhere in the project)
    var escapedWords = aWords.map(RegExp.quote);

    var regex = new XRegExp(
        beforeWord + "(" + escapedWords.join("|") + ")" + afterWord,
        "xgi"
    );

    // $1 is the delimiter matched before the word, $2 is the word itself.
    var replacement =
        '$1<span class="' + cssClass + '"' + extra + ">$2</span>";

    // We must not replace any occurrences inside <...>. For example, if html is abc <span class='word'>x</span>
    // and we are trying to wrap 'word', we should not change anything.
    // To prevent this we split the string into sections starting at <. If this is valid html, each except the first
    // should have exactly one >. We strip off everything up to the > and do the wrapping within the rest.
    // Finally we put the pieces back together.
    var parts = storyHTML.split("<");
    var modParts = [];
    for (var i = 0; i < parts.length; i++) {
        var text = parts[i];
        var prefix = "";
        if (i !== 0) {
            // If there is no ">", indexOf gives -1: the prefix is empty and the whole part is searched.
            var index = text.indexOf(">");
            prefix = text.substring(0, index + 1);
            text = text.substring(index + 1, text.length);
        }
        modParts.push(prefix + XRegExp.replace(text, regex, replacement));
    }

    return modParts.join("<");
};
/**
 * Rewrites the mixin usages found in one Sass file and saves the result in place.
 *
 * Every match of `mixins_re` is handed to the handler registered in `mixins`
 * under the matched mixin name. A truthy handler result replaces the matched
 * rule; otherwise the rule is kept and an error is logged. With `--ignored`,
 * matches of `other_mixin_re` in the rewritten text are listed. With
 * `--dry-run`, nothing is written back to disk.
 *
 * Assumes the match object exposes `mixin`, `args` and `rule` (presumably
 * named captures of `mixins_re`; confirm against its definition).
 *
 * @param {string} filepath Path of the Sass file to process.
 */
function processSass(filepath) {
  console.log("Processing %s", filepath);

  var source = fs.readFileSync(filepath, "utf8");

  // Decide what a single matched mixin rule turns into.
  function rewriteMixin(match) {
    var replacement = mixins[match.mixin](match.args);

    if (!replacement) {
      console.error(chalk.bgRed("Cannot replace %s"), match.rule);
      return match.rule;
    }

    if (argv["dry-run"] || argv.verbose) {
      console.log("Replacing");
      console.log("\t", chalk.red(match.rule));
      console.log("\t", chalk.green(replacement));
    }
    return replacement;
  }

  var converted = XRegExp.replace(source, mixins_re, rewriteMixin);

  // Optionally report the mixins that were left alone.
  if (argv.ignored && other_mixin_re.test(converted)) {
    console.log("Ignored:");
    XRegExp.forEach(converted, other_mixin_re, function(found) {
      console.log("\t", chalk.magenta(found[1]));
    });
  }

  if (argv["dry-run"]) {
    console.log("Done (*not* saved)");
    return;
  }

  fs.writeFileSync(filepath, converted, "utf8");
  console.log("Saved");
}
/**
 * Breaks a chunk of HTML text into a list of lower-cased words.
 *
 * Line breaks become spaces, punctuation that is not inside a word becomes a
 * space, and the remaining text is split on separators plus a selected set of
 * control and format characters.
 *
 * @param {string} textHTML The text to split. Must be a string.
 * @param {string} [letters] Characters the user treats as letters. They are placed
 *   verbatim inside a regex character class.
 * @returns {string[]} The words, in order of appearance.
 */
LibSynphony.prototype.getWordsFromHtmlString = function(textHTML, letters) {
    // html breaks and raw line breaks all count as a single space
    const lineBreaks = /<br><\/br>|<br>|<br \/>|<br\/>|\r?\n/g;
    const lowered = textHTML.replace(lineBreaks, " ").toLowerCase();

    // BL-1216 A negative look-ahead keeps user-specified letters from being counted
    // as punctuation, even when Unicode classifies them as punctuation
    // (a single quote, for example).
    const punct = letters ? "(?![" + letters + "])" + "\\p{P}" : "\\p{P}";

    // a run of whitespace / separator / "other" characters
    const gap = "[\\s\\p{Z}\\p{C}]+";

    /**************************************************************************
     * Punctuation to blank out. Punctuation inside a word (a hyphen, or the
     * apostrophe in a contraction) matches none of these and so is preserved.
     **************************************************************************/
    const strayPunctuation = XRegExp(
        [
            `(^${punct}+)`, // at the beginning of the string
            `(${punct}+${gap}${punct}+)`, // between 2 words (word" "word)
            `(${gap}${punct}+)`, // before a word
            `(${punct}+${gap})`, // after a word
            `(${punct}+$)` // at the end of the string
        ].join("|"),
        "g"
    );
    const cleaned = XRegExp.replace(lowered, strayPunctuation, " ");

    // Characters that separate words. All of p{C} was found to be too all-encompassing,
    // so only p{Cc} plus hand-picked format (p{Cf}) characters are used.
    // ZERO WIDTH SPACE splits words
    // (see http://issues.bloomlibrary.org/youtrack/issue/BL-3933).
    // ZERO WIDTH JOINER and NON JOINER are also format characters but are not in this set
    // (see https://issues.bloomlibrary.org/youtrack/issue/BL-7081).
    // Full list of format characters: https://www.compart.com/en/unicode/category/Cf
    const wordBreakers = [
        "\\p{Z}", // whitespace / separators
        "\\p{Cc}", // "real" Control characters
        "\u200b", // ZERO WIDTH SPACE
        "\u200e\u200f", // LEFT-TO-RIGHT MARK / RIGHT-TO-LEFT MARK
        "\u202A-\u202E", // more LTR/RTL/directional markers
        "\u2066-\u2069" // directional "isolate" markers
    ].join("");

    const splitter = XRegExp("[" + wordBreakers + "]+", "xg");
    return XRegExp.split(cleaned.trim(), splitter);
};
// Reduce each element of `data` to the text found between <body> and </body>.
// The 's' flag lets '.' match line breaks, so the pattern spans the whole
// element and the replacement keeps only capture group 1. An element that does
// not match the pattern is passed through unchanged.
const segmentsOnly = data.map((entry) => {
  const bodyPattern = XRegExp('.*<body>(.*?)</body>.*', 's');
  return XRegExp.replace(entry, bodyPattern, '$1');
});
/**
 * Splits html text into sentence fragments and the inter-sentence space between them.
 *
 * Html tags, line breaks and non-breaking-space entities are first swapped for
 * single control-character place holders so they cannot interfere with the
 * punctuation-based matching; they are put back into each fragment afterwards.
 *
 * Relies on the XRegExp properties \p{PEP}, \p{SEP} and \p{SCP} (presumably registered
 * elsewhere in the project) and on the TextFragment constructor defined elsewhere.
 *
 * @param {string|null|undefined} textHTML The text to split; null/undefined is treated as ''.
 * @returns {TextFragment[]} Fragments in document order. The second constructor argument
 *   is true for fragments that are space between sentences, false otherwise.
 */
LibSynphony.prototype.stringToSentences = function(textHTML) {

    // place holders
    var delimiter = String.fromCharCode(0);
    var htmlLineBreak = String.fromCharCode(1);     // html break tags count as white space
    var windowsLineBreak = String.fromCharCode(2);  // CR and LF count as white space
    var nonSentence = String.fromCharCode(3);       // u0003 is used to indicate a segment that is not part of a sentence
    var tagHolderOpen = String.fromCharCode(4);     // u0004 is a replacement character for all other opening html tags
    var tagHolderClose = String.fromCharCode(5);    // u0005 is a replacement character for all other closing html tags
    var tagHolderSelf = String.fromCharCode(6);     // u0006 is a replacement character for all other self-closing html tags
    var tagHolderEmpty = String.fromCharCode(7);    // u0007 is a replacement character for empty html tags
    var nbsp = String.fromCharCode(8);              // u0008 is a replacement character for the non-breaking-space entity

    // Replaces each occurrence of placeHolder in fragment, left to right, with the next
    // saved piece of text. A replacer function is used so that '$' sequences inside the
    // saved text (e.g. "$&" in an attribute value) are inserted literally instead of
    // being interpreted as replacement patterns.
    function restore(fragment, placeHolder, saved) {
        while (fragment.indexOf(placeHolder) > -1) {
            var original = saved.shift();
            fragment = fragment.replace(placeHolder, function() { return original; });
        }
        return fragment;
    }

    // treat both null and undefined as empty text
    if (textHTML == null) textHTML = '';

    // look for html break tags, replace them with the htmlLineBreak place holder
    var regex = /(<br><\/br>|<br>|<br \/>|<br\/>)/g;
    textHTML = textHTML.replace(regex, htmlLineBreak);

    // look for Windows line breaks, replace them with the windowsLineBreak place holder
    regex = /(\r\n)/g;
    textHTML = textHTML.replace(regex, windowsLineBreak);

    // collect opening html tags and replace with tagHolderOpen place holder
    var openTags = textHTML.match(/<[^\/][^<>]+[^\/]>/g);
    textHTML = textHTML.replace(/<[^\/][^<>]+[^\/]>/g, tagHolderOpen);

    // collect closing html tags and replace with tagHolderClose place holder
    var closeTags = textHTML.match(/<[\/][^<>]+>/g);
    textHTML = textHTML.replace(/<[\/][^<>]+>/g, tagHolderClose);

    // collect self-closing html tags and replace with tagHolderSelf place holder
    var selfTags = textHTML.match(/<[^<>]+[\/]>/g);
    textHTML = textHTML.replace(/<[^<>]+[\/]>/g, tagHolderSelf);

    // collect empty html tags (an open place holder immediately followed by a close
    // place holder) and replace with tagHolderEmpty place holder. What is saved is the
    // place-holder pair itself; the real tags stay queued in openTags / closeTags.
    var emptyTags = textHTML.match(/\u0004\u0005/g);
    textHTML = textHTML.replace(/\u0004\u0005/g, tagHolderEmpty);

    // replace the non-breaking-space entity with the nbsp place holder
    textHTML = textHTML.replace(/&nbsp;/g, nbsp);

    // look for paragraph ending sequences
    regex = XRegExp(
        '[^\\p{PEP}]*[\\p{PEP}]+' // break on all paragraph ending punctuation (PEP)
        + '|[^\\p{PEP}]+$',
        'g');

    // break the text into paragraphs
    var paragraphs = XRegExp.match(textHTML, regex);

    // We require at least one space between sentences, unless things have been configured so that
    // space IS a sentence-ending punctuation. In that case, zero or more.
    var intersentenceSpace = '([\\s\\p{PEP}\\u0006\\u0007\\u0008]' +
        (LibSynphony.prototype.extraSentencePunct && LibSynphony.prototype.extraSentencePunct.indexOf('\\u0020') >= 0
            ? '*' : '+') +
        ')';

    // regex to find sentence ending sequences and inter-sentence space
    regex = XRegExp(
        '([\\p{SEP}]+'                                  // sentence ending punctuation (SEP)
        // Note that categories Pf and Pi can both act as either Ps or Pe
        // (See https://issues.bloomlibrary.org/youtrack/issue/BL-5063.)
        + '[\'"\\p{Pe}\\p{Pf}\\p{Pi}\\u0005]*)'         // characters that can follow the SEP
        + '([\\u0004]*)'                                // opening tag between sentences
        + intersentenceSpace
        + '([\\u0005]*)'                                // closing tag between sentences
        + '(?![^\\p{L}]*'                               // may be followed by non-letter chars
        + '[\\p{Ll}\\p{SCP}]+)',                        // first letter following is not lower case
        'g');

    var returnVal = [];
    for (var i = 0; i < paragraphs.length; i++) {

        // mark boundaries between sentences and inter-sentence space
        var paragraph = XRegExp.replace(paragraphs[i], regex, '$1' + delimiter + nonSentence + '$2' + '$3' + '$4' + delimiter);

        // restore line breaks
        paragraph = paragraph.replace(/\u0001/g, '<br />');
        paragraph = paragraph.replace(/\u0002/g, '\r\n');

        // split the paragraph into sentences and inter-sentence space
        var fragments = paragraph.split(delimiter);
        for (var j = 0; j < fragments.length; j++) {

            var fragment = fragments[j];

            // Order matters: an empty-tag place holder expands to an open + close place
            // holder pair, which the next two steps then turn back into the real tags.
            fragment = restore(fragment, tagHolderEmpty, emptyTags);

            // put the opening html tags back in
            fragment = restore(fragment, tagHolderOpen, openTags);

            // put the closing html tags back in
            fragment = restore(fragment, tagHolderClose, closeTags);

            // put the self-closing html tags back in
            fragment = restore(fragment, tagHolderSelf, selfTags);

            // put the non-breaking-space entity back in
            fragment = fragment.replace(/\u0008/g, '&nbsp;');

            // check to avoid blank segments at the end
            if ((j < (fragments.length - 1)) || (fragment.length > 0)) {

                // is this space between sentences?
                if (fragment.substring(0, 1) === nonSentence)
                    returnVal.push(new TextFragment(fragment.substring(1), true));
                else
                    returnVal.push(new TextFragment(fragment, false));
            }
        }
    }

    return returnVal;
};