/**
 * Breaks a string of HTML/text into lower-cased words.
 *
 * Html break tags and line breaks become spaces, punctuation at the edge of a
 * word is dropped (punctuation inside a word, such as a hyphen or the
 * apostrophe of a contraction, is kept), and the remainder is split into words.
 *
 * @param {string} textHTML the text to split
 * @param {string} [letters] characters that must never be treated as
 *      punctuation. They are inserted verbatim into a regex character class, so
 *      anything that is special inside [...] has to be escaped already.
 * @returns {string[]} the words found; an empty input yields [""]
 */
LibSynphony.prototype.getWordsFromHtmlString = function (textHTML, letters) {

    // replace html break with space
    const lineBreaks = /<br><\/br>|<br>|<br \/>|<br\/>|\r?\n/g;
    let s = textHTML.replace(lineBreaks, ' ').toLowerCase();

    let punct = "\\p{P}";

    if (letters) {
        // BL-1216 Use negative look-ahead to keep letters from being counted as punctuation
        // even if Unicode says something is a punctuation character when the user
        // has specified it as a letter (like single quote).
        punct = "(?![" + letters + "])" + punct;
    }

    /**************************************************************************
     * Replace punctuation in a sentence with a space.
     *
     * Preserves punctuation marks within a word (ex. hyphen, or an apostrophe
     * in a contraction)
     **************************************************************************/
    const gap = '[\\s\\p{Z}\\p{C}]+';
    const edgePunctuation = XRegExp(
        '(^' + punct + '+)'                      // punctuation at the beginning of a string
        + '|(' + punct + '+' + gap + punct + '+)' // punctuation within a sentence, between 2 words (word" "word)
        + '|(' + gap + punct + '+)'               // punctuation within a sentence, before a word
        + '|(' + punct + '+' + gap + ')'          // punctuation within a sentence, after a word
        + '|(' + punct + '+$)',                   // punctuation at the end of a string
        'g');
    s = XRegExp.replace(s, edgePunctuation, ' ');

    // Split into words using Separator characters and only SOME Control characters.
    // Splitting on all of \p{C} is too broad: that category also contains
    // ZERO WIDTH JOINER and ZERO WIDTH NON-JOINER, which belong inside words
    // (see https://issues.bloomlibrary.org/youtrack/issue/BL-7081).
    // ZERO WIDTH SPACE must still split words
    // (see http://issues.bloomlibrary.org/youtrack/issue/BL-3933).
    const wordBreak = XRegExp(
        '['
        + '\\p{Z}'            // separators (whitespace)
        + '\\p{Cc}'           // "real" control characters
        + '\\u200b'           // ZERO WIDTH SPACE
        + '\\u200e\\u200f'    // LEFT-TO-RIGHT MARK / RIGHT-TO-LEFT MARK
        + '\\u202a-\\u202e'   // directional embedding / override markers
        + '\\u2066-\\u2069'   // directional isolate markers
        + ']+',
        'g');
    return XRegExp.split(s.trim(), wordBreak);
};
Exemple #2
0
/**
 * Applies a list of substitutions to a query, one after another.
 *
 * Each token supplies `from` (what to look for) and `to` (what to put in its
 * place). Tokens whose `named` property is truthy are applied with
 * XRegExp.replace; all others go through the native String.prototype.replace.
 *
 * @param {Array} tokens substitutions, applied in index order
 * @param {string} query the text to transform
 * @returns {string} the query after every substitution has been applied
 */
module.exports.replaceToken = function(tokens, query) {
    let result = query;

    for (let idx = 0; idx < tokens.length; idx += 1) {
        const token = tokens[idx];
        result = token.named
            ? XRegExp.replace(result, token.from, token.to)
            : result.replace(token.from, token.to);
    }

    return result;
};
Exemple #3
0
	uuid: function(seed) {
		// Create SHA hash from seed.
		var shaObj = new jsSHA("SHA-1", "TEXT");
		shaObj.update(seed);
		var hashStr = shaObj.getHash("HEX").substring(0, 32);
		// Build a uuid based on the md5
		var search = XRegExp('^(?<first>.{8})(?<second>.{4})(?<third>.{1})(?<fourth>.{3})(?<fifth>.{1})(?<sixth>.{3})(?<seventh>.{12}$)');
		var replace = XRegExp('${first}-${second}-3${fourth}-a${sixth}-${seventh}');
		// Replace regexp by corresponding mask, and remove / character at each side of the result.
		var uuid = XRegExp.replace(hashStr, search, replace).replace(/\//g, '');
		return uuid;
	},
/**
 * Wraps every occurrence of the given words in a <span> carrying a css class
 * and optional extra attributes.
 *
 * Only text outside of html tags is touched; tag names and attribute values
 * are left as they are. Matching is case-insensitive and limited to whole
 * words, as delimited by the beforeWord / afterWord patterns below.
 *
 * @param {string} storyHTML the html to process
 * @param {string[]} aWords the words to wrap; when undefined or empty the html
 *      is returned unchanged
 * @param {string} cssClass value for the span's class attribute
 * @param {string} [extra] additional attribute text for the span; a leading
 *      space is added when missing. May be omitted.
 * @returns {string} the html with the matching words wrapped
 */
LibSynphony.prototype.wrap_words_extra = function(
    storyHTML,
    aWords,
    cssClass,
    extra
) {
    if (aWords === undefined || aWords.length === 0) return storyHTML;

    if (storyHTML.trim().length === 0) return storyHTML;

    // A missing `extra` simply means "no additional attributes".
    let attributes = extra === undefined || extra === null ? "" : extra;

    // make sure the attribute text starts with a space
    if (attributes.length > 0 && attributes.substring(0, 1) !== " ") {
        attributes = " " + attributes;
    }

    const beforeWord = "(^|>|[\\s\\p{Z}]|\\p{P}|&nbsp;)"; // word beginning delimiter
    const afterWord =
        "(?=($|<|[\\s\\p{Z}]|\\p{P}+\\s|\\p{P}+<br|[\\s]*&nbsp;|\\p{P}+&nbsp;|\\p{P}+$))"; // word ending delimiter

    // escape special characters
    const escapedWords = aWords.map(function(word) {
        return RegExp.quote(word);
    });

    // The free-spacing ("x") flag is deliberately not used: it would make
    // XRegExp ignore whitespace and treat "#" as a comment start inside the
    // word list, so a word containing either would be matched incorrectly.
    const regex = new XRegExp(
        beforeWord + "(" + escapedWords.join("|") + ")" + afterWord,
        "gi"
    );
    const replacement =
        '$1<span class="' + cssClass + '"' + attributes + ">$2</span>";

    // We must not replace any occurrences inside <...>. For example, if html is abc <span class='word'>x</span>
    // and we are trying to wrap 'word', we should not change anything.
    // To prevent this we split the string into sections starting at <. If this is valid html, each except the first
    // should have exactly one >. We strip off everything up to the > and do the wrapping within the rest.
    // Finally we put the pieces back together.
    const modParts = storyHTML.split("<").map(function(part, i) {
        let prefix = "";
        let text = part;
        if (i !== 0) {
            // When there is no ">", indexOf gives -1, so the prefix is empty
            // and the whole part is treated as text.
            const index = part.indexOf(">");
            prefix = part.substring(0, index + 1);
            text = part.substring(index + 1, part.length);
        }
        return prefix + XRegExp.replace(text, regex, replacement);
    });

    return modParts.join("<");
};
/**
 * Rewrites the mixin usages found in one Sass file.
 *
 * Every match of `mixins_re` is handed to the handler registered in `mixins`
 * under the match's `mixin` property. A truthy handler result replaces the
 * match; otherwise an error is logged and the matched rule is kept. With
 * `argv.ignored` set, matches of `other_mixin_re` that remain afterwards are
 * listed. The file is written back unless `argv["dry-run"]` is set.
 *
 * NOTE(review): `match.mixin`, `match.args` and `match.rule` are presumably
 * named captures of `mixins_re` - confirm against its definition.
 *
 * @param {string} filepath path of the Sass file to process
 */
function processSass(filepath) {
  console.log("Processing %s", filepath);
  const originalSass = fs.readFileSync(filepath, "utf8");

  // Decides what a single mixin usage turns into.
  const rewriteRule = (match) => {
    const replacement = mixins[match.mixin](match.args);

    if (!replacement) {
      console.error(chalk.bgRed("Cannot replace %s"), match.rule);
      return match.rule;
    }

    if (argv["dry-run"] || argv.verbose) {
      console.log("Replacing");
      console.log("\t", chalk.red(match.rule));
      console.log("\t", chalk.green(replacement));
    }
    return replacement;
  };

  const convertedSass = XRegExp.replace(originalSass, mixins_re, rewriteRule);

  // Optionally report the mixin usages that were left alone.
  if (argv.ignored && other_mixin_re.test(convertedSass)) {
    console.log("Ignored:");
    XRegExp.forEach(convertedSass, other_mixin_re, (match) => {
      console.log("\t", chalk.magenta(match[1]));
    });
  }

  if (!argv["dry-run"]) {
    fs.writeFileSync(filepath, convertedSass, "utf8");
    console.log("Saved");
  } else {
    console.log("Done (*not* saved)");
  }
}
/**
 * Breaks a string of HTML/text into lower-cased words.
 *
 * Html break tags and line breaks become spaces, punctuation at the edge of a
 * word is dropped (punctuation inside a word, such as a hyphen or the
 * apostrophe of a contraction, is kept), and the remainder is split on
 * whitespace and a chosen set of control characters.
 *
 * @param {string} textHTML the text to split
 * @param {string} [letters] characters that must never be treated as
 *      punctuation; inserted verbatim into a regex character class
 * @returns {string[]} the words found
 */
LibSynphony.prototype.getWordsFromHtmlString = function(textHTML, letters) {
    // Html breaks and raw line breaks separate words just like a space does.
    const flattened = textHTML
        .replace(/<br><\/br>|<br>|<br \/>|<br\/>|\r?\n/g, " ")
        .toLowerCase();

    // BL-1216: the negative look-ahead keeps user-declared letters (a single
    // quote, for instance) from being counted as punctuation even though
    // Unicode classifies them that way.
    const punct = letters ? `(?![${letters}])\\p{P}` : "\\p{P}";
    const gap = "[\\s\\p{Z}\\p{C}]";

    // Punctuation is replaced by a space only where it touches the edge of a
    // word; marks inside a word are preserved.
    const edgePunctuation = XRegExp(
        [
            `(^${punct}+)`, // at the beginning of the string
            `(${punct}+${gap}+${punct}+)`, // between 2 words (word" "word)
            `(${gap}+${punct}+)`, // before a word
            `(${punct}+${gap}+)`, // after a word
            `(${punct}+$)` // at the end of the string
        ].join("|"),
        "g"
    );
    const stripped = XRegExp.replace(flattened, edgePunctuation, " ");

    // Words are split on Separator characters and SOME Control characters.
    // All of p{C} was too all-encompassing: ZERO WIDTH SPACE has to split
    // (http://issues.bloomlibrary.org/youtrack/issue/BL-3933) whereas
    // ZERO WIDTH JOINER / NON JOINER must not
    // (https://issues.bloomlibrary.org/youtrack/issue/BL-7081).
    // For the Control(format) characters see
    // https://www.compart.com/en/unicode/category/Cf
    const splitters = [
        "\\p{Z}", // whitespace
        "\\p{Cc}", // "real" Control characters
        "\u200b", // ZERO WIDTH SPACE
        "\u200e\u200f", // LEFT-TO-RIGHT MARK / RIGHT-TO-LEFT MARK
        "\u202A-\u202E", // more LTR/RTL/directional markers
        "\u2066-\u2069" // directional "isolate" markers
    ].join("");
    const wordBreak = XRegExp(`[${splitters}]+`, "xg");

    return XRegExp.split(stripped.trim(), wordBreak);
};
// Reduce every entry of `data` to the text found between <body> and </body>.
// The 's' flag lets '.' span line breaks, so the pattern consumes the whole
// entry and the replacement keeps only the captured body content. An entry the
// pattern does not match comes back unchanged.
const segmentsOnly = data.map((html) =>
   XRegExp.replace(html, XRegExp('.*<body>(.*?)</body>.*', 's'), '$1')
);
/**
 * Breaks a string of HTML into sentences and the segments between them.
 *
 * Markup is swapped for single control characters (u0000 - u0008) while the
 * text is segmented, so the input is expected not to contain those characters
 * itself. Once the text has been cut up, the saved markup is put back in its
 * original order.
 *
 * @param {string} textHTML the html to segment; null or undefined is treated
 *      as an empty string
 * @returns {Array} TextFragment objects in document order. The second
 *      constructor argument is true for a segment that lies between two
 *      sentences and false for any other segment.
 */
LibSynphony.prototype.stringToSentences = function(textHTML) {
    // place holders
    const delimiter = String.fromCharCode(0); // marks the places where the text gets cut
    const htmlLineBreak = String.fromCharCode(1); // html break tags count as white space
    const windowsLineBreak = String.fromCharCode(2); // CR and LF count as white space
    const nonSentence = String.fromCharCode(3); // u0003 is used to indicate a segment that is not part of a sentence
    const tagHolderOpen = String.fromCharCode(4); // u0004 is a replacement character for all other opening html tags
    const tagHolderClose = String.fromCharCode(5); // u0005 is a replacement character for all other closing html tags
    const tagHolderSelf = String.fromCharCode(6); // u0006 is a replacement character for all other self-closing html tags
    const tagHolderEmpty = String.fromCharCode(7); // u0007 is a replacement character for empty html tags
    const nbsp = String.fromCharCode(8); // u0008 is a replacement character for &nbsp;

    // a missing value (null or undefined) is handled like empty text
    let html = textHTML === null || textHTML === undefined ? "" : textHTML;

    // look for html break tags, replace them with the htmlLineBreak place holder
    html = html.replace(/(<br><\/br>|<br>|<br \/>|<br\/>)/g, htmlLineBreak);

    // look for Windows line breaks, replace them with the windowsLineBreak place holder
    html = html.replace(/(\r\n)/g, windowsLineBreak);

    // collect opening html tags and replace with tagHolderOpen place holder
    // NOTE(review): this pattern needs at least three characters between the
    // angle brackets, so tags such as <p> or <b> are not collected and stay in
    // the text - confirm that this is intended.
    const openTagPattern = /<[^\/][^<>]+[^\/]>/g;
    const openTags = html.match(openTagPattern);
    html = html.replace(openTagPattern, tagHolderOpen);

    // collect closing html tags and replace with tagHolderClose place holder
    const closeTagPattern = /<[\/][^<>]+>/g;
    const closeTags = html.match(closeTagPattern);
    html = html.replace(closeTagPattern, tagHolderClose);

    // collect self-closing html tags and replace with tagHolderSelf place holder
    const selfTagPattern = /<[^<>]+[\/]>/g;
    const selfTags = html.match(selfTagPattern);
    html = html.replace(selfTagPattern, tagHolderSelf);

    // collect empty html tags (an opening place holder directly followed by a
    // closing one) and replace with tagHolderEmpty place holder
    const emptyTagPattern = /\u0004\u0005/g;
    const emptyTags = html.match(emptyTagPattern);
    html = html.replace(emptyTagPattern, tagHolderEmpty);

    // replace &nbsp; with nbsp
    html = html.replace(/&nbsp;/g, nbsp);

    // look for paragraph ending sequences
    const paragraphPattern = XRegExp(
        '[^\\p{PEP}]*[\\p{PEP}]+' // break on all paragraph ending punctuation (PEP)
        + '|[^\\p{PEP}]+$',
        'g');

    // break the text into paragraphs
    const paragraphs = XRegExp.match(html, paragraphPattern);

    // We require at least one space between sentences, unless things have been configured so that
    // space IS a sentence-ending punctuation. In that case, zero or more.
    const spaceEndsSentence = LibSynphony.prototype.extraSentencePunct
        && LibSynphony.prototype.extraSentencePunct.indexOf('\\u0020') >= 0;
    const intersentenceSpace = '([\\s\\p{PEP}\\u0006\\u0007\\u0008]'
        + (spaceEndsSentence ? '*' : '+')
        + ')';

    // regex to find sentence ending sequences and inter-sentence space
    const boundaryPattern = XRegExp(
        '([\\p{SEP}]+'                      // sentence ending punctuation (SEP)
        // Note that categories Pf and Pi can both act as either Ps or Pe
        // (See https://issues.bloomlibrary.org/youtrack/issue/BL-5063.)
        + '[\'"\\p{Pe}\\p{Pf}\\p{Pi}\\u0005]*)' // characters that can follow the SEP
        + '([\\u0004]*)'                    // opening tag between sentences
        + intersentenceSpace
        + '([\\u0005]*)'                    // closing tag between sentences
        + '(?![^\\p{L}]*'                   // may be followed by non-letter chars
        + '[\\p{Ll}\\p{SCP}]+)',            // first letter following is not lower case
        'g');

    // Cuts after the sentence end and again after the inter-sentence segment,
    // flagging the latter with the nonSentence marker.
    const boundaryMarkup = '$1' + delimiter + nonSentence + '$2' + '$3' + '$4' + delimiter;

    // Swaps each occurrence of a place holder for the next saved tag. The tag
    // is supplied through a replacer function so that "$" sequences inside it
    // ($&, $$, ...) are inserted literally instead of being interpreted as
    // replacement patterns.
    const restoreTags = function(fragment, holder, tags) {
        let restored = fragment;
        while (restored.indexOf(holder) > -1) {
            restored = restored.replace(holder, function() {
                return tags.shift();
            });
        }
        return restored;
    };

    const returnVal = [];
    for (let i = 0; i < paragraphs.length; i++) {

        // mark boundaries between sentences and inter-sentence space
        let paragraph = XRegExp.replace(paragraphs[i], boundaryPattern, boundaryMarkup);

        // restore line breaks
        paragraph = paragraph.replace(/\u0001/g, '<br />');
        paragraph = paragraph.replace(/\u0002/g, '\r\n');

        // split the paragraph into sentences and inter-sentence segments
        const fragments = paragraph.split(delimiter);
        for (let j = 0; j < fragments.length; j++) {

            let fragment = fragments[j];

            // put the empty html tags back in; each one expands to an opening
            // plus a closing place holder, which the next two steps resolve
            fragment = restoreTags(fragment, tagHolderEmpty, emptyTags);

            // put the opening html tags back in
            fragment = restoreTags(fragment, tagHolderOpen, openTags);

            // put the closing html tags back in
            fragment = restoreTags(fragment, tagHolderClose, closeTags);

            // put the self-closing html tags back in
            fragment = restoreTags(fragment, tagHolderSelf, selfTags);

            // put nbsp back in
            fragment = fragment.replace(/\u0008/g, "&nbsp;");

            // check to avoid blank segments at the end
            if ((j < (fragments.length - 1)) || (fragment.length > 0)) {

                // is this space between sentences?
                if (fragment.substring(0, 1) === nonSentence)
                    returnVal.push(new TextFragment(fragment.substring(1), true));
                else
                    returnVal.push(new TextFragment(fragment, false));
            }
        }
    }

    return returnVal;
};