Exemple #1
0
/**
 * Fetch the text behind `url` and split its body into sections.
 *
 * Pipeline: read the URL, extract the body nodes, strip `header` and
 * `script` tags, strip content matching /id="toc"/, flatten the remaining
 * nodes and group them into sections.
 *
 * @param {*} url - Value handed to fu.read_url (presumably a URL string;
 *     verify against callers).
 * @param {Function} callback - Error-first callback. Called with (err) if
 *     reading the URL or extracting the body fails, otherwise with
 *     (null, sections).
 */
module.exports.chunk_url = function(url, callback) {

    fu.read_url(url, function(err, txt) {
        // Stop here on a failed read: txt is not usable in that case.
        if (err) {
            return callback(err);
        }
        _get_body(txt, function(err, nodes) {
            // Same for body extraction: nodes is not usable on error.
            if (err) {
                return callback(err);
            }
            nodes = remove_tags(nodes, ['header', 'script']);
            nodes = remove_content(nodes, /id="toc"/);
            var flat_nodes = flatten_nodes(nodes);
            var sections = node_sections(flat_nodes);
            callback(null, sections);
        });
    });
};
Exemple #2
0
        // Put the page, read the text at its URL, extract the links from that
        // text and pass them to self._update_links. A failure at any step is
        // reported through callback(err) instead of being ignored.
        this.put(page, function(err, page){
            // From here on `page` is the value handed back by put; it
            // shadows the outer `page`.
            if (err) {
                return callback(err);
            }
            file_utils.read_url(page.url, function(err, txt) {
                if (err) {
                    return callback(err);
                }

                page.txt = txt;
                console.log(__filename + '::_reindex_links:: finding links in txt');
                file_utils.links_in_txt(txt, function(err, links) {
                    if (err) {
                        console.log(__filename + ':: error in _reindex_links::links_in_txt');
                        console.log(err);
                        return callback(err);
                    }
                    self._update_links(self, page, links, callback);
                });
            });
        });
Exemple #3
0
    _reindex_irc_lines: function(page, callback) {
        var self = this;
        var lines_being_put = 0;
        file_utils.read_url(page.url,
        function(err, txt) {
            file_utils.lines_in_txt(txt,
            function(err, new_lines) {

                var lines_model = require('models/lines');
                lines_model.model(function(err, model) {
                    model.delete({
                        url: page.url
                    },
                    function() {
                        new_lines.forEach(function(line, i) {
                            if (! (i % 1000)) {
                                console.log(__filename + ':: indexing line ' + i + ' of ' + new_lines.length + ' of page ' + page.url);
                            }
                            line.url = page.url; ++lines_being_put;
                            model.put(line,
                            function() {--lines_being_put;
                            });
                        });

                        var interval = setInterval(function() {
                            console.log(page.url + ': remaining lines: ' + lines_being_put);
                            if (lines_being_put < 1) {
                                console.log('ending parse of ' + page.url);
                                clearInterval(interval);

                                self.put(page,
                                function(err, page) {
                                    page.indexed = new Date();
                                    self.put(page, callback);
                                });
                            }
                        },
                        2000);
                    });
                });
            });
        });
    },