Beispiel #1
0
import {
  DATE_PUBLISHED_META_TAGS,
  DATE_PUBLISHED_SELECTORS,
  DATE_PUBLISHED_URL_RES,
} from './constants';

const GenericDatePublishedExtractor = {
  extract({ $, url, metaCache }) {
    let datePublished;
    // First, check to see if we have a matching meta tag
    // that we can make use of.
    // Don't try cleaning tags from this string
    datePublished = extractFromMeta(
      $,
      DATE_PUBLISHED_META_TAGS,
      metaCache,
      false
    );
    if (datePublished) return cleanDatePublished(datePublished);

    // Second, look through our selectors looking for potential
    // date_published's.
    datePublished = extractFromSelectors($, DATE_PUBLISHED_SELECTORS);
    if (datePublished) return cleanDatePublished(datePublished);

    // Lastly, look to see if a dately string exists in the URL
    datePublished = extractFromUrl(url, DATE_PUBLISHED_URL_RES);
    if (datePublished) return cleanDatePublished(datePublished);

    return null;
  },
Beispiel #2
0
  return hostname;
}

/**
 * Builds the extractor's return value for a given URL.
 *
 * @param {string} url - The URL to report.
 * @returns {{url: string, domain: *}} The URL together with whatever
 *   parseDomain returns for it.
 */
function result(url) {
  const domain = parseDomain(url);
  return { url, domain };
}

const GenericUrlExtractor = {
  /**
   * Determines the canonical URL of a document.
   *
   * Sources, in priority order:
   *   1. the href of a `link[rel=canonical]` element,
   *   2. a meta value found via CANONICAL_META_SELECTORS,
   *   3. the URL that was passed in.
   *
   * @param {Object} options
   * @param {Function} options.$ - DOM query handle.
   * @param {string} options.url - URL the document was loaded from.
   * @param {*} options.metaCache - Forwarded untouched to extractFromMeta.
   * @returns {Object} Whatever `result` builds for the chosen URL.
   */
  extract({ $, url, metaCache }) {
    const $link = $('link[rel=canonical]');
    // Only read the attribute when the query actually matched something.
    const canonicalHref = $link.length !== 0 ? $link.attr('href') : null;
    if (canonicalHref) return result(canonicalHref);

    // No usable <link>: try the meta tags, then fall back to the input URL.
    const fromMeta = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache);
    return result(fromMeta || url);
  },
};

export default GenericUrlExtractor;