#!/usr/bin/perl
#
use strict;
use warnings;
use threads;
use threads::shared;
use LWP::UserAgent;
use URI;
# 1. Remove the first website from the queue (FIFO makes sense here, I think)
# 2. Get all links from the website and add them to the queue (if they're not already in it)
# 3. Query known common directories/files on the website (robots.txt check etc.) and add
#    those paths as links to the queue as well if they hold any additional crawling information
# 4. Index the URL based on the text content of the page in some distributed database
#    (Elasticsearch? PostgreSQL? MongoDB?)
# 5. Repeat from step 1
#
# Expanding on step 2: we need to ensure we're not adding items to the queue that already
# exist in it, because that is duplicate work. To do this we keep track of all the known
# URLs and quickly check whether we've indexed them already or they're already queued.
#
# If a URL already exists in the index db, we check its expiry time to decide whether to
# re-fetch its content; if it hasn't expired yet, we're done and don't add it to the queue.
#
# If the URL is already in the queue to be indexed, we are again done. This membership
# check has to be fast: a linear scan over a queue that may hold millions of items is not
# an option. A sorted structure would give O(log n), but a hash keyed by URL gives O(1)
# expected time, which is what %db below implements (see the sketch after this comment).
#
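# A minimal sketch of that hash-based dedup-with-expiry check (%seen and
# @queue are illustrative names; the real implementation is Spider::Queue
# below):
#
#   if (!exists $seen{$url} || $seen{$url}{expires} <= time) {
#       $seen{$url} = { expires => time + $expire_time };
#       push @queue, $url;
#   }
#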
{
    package Spider::Queue;

    my $expire_time = 300; # Seconds before we re-check a URL

    sub new {
        my $class = shift;
        my $self = bless {}, $class;
        my @jobqueue :shared;
        my %db :shared;
        $self->{queue} = \@jobqueue;
        $self->{db} = \%db;
        return $self;
    }

    # Add jobs to the queue, skipping URLs we already know about that
    # haven't expired yet.
    sub add {
        my ($self, @jobs) = @_;
        lock($self->{db});
        # Note: a plain `return` inside a grep block would return from add()
        # itself, so the block yields 1/0 as its last expression instead.
        my @filtered = grep {
            if (!exists($self->{db}{$_}) || $self->{db}{$_}{expires} <= time) {
                my %hash :shared = (expires => time + $expire_time);
                $self->{db}{$_} = \%hash;
                1;
            } else {
                0;
            }
        } @jobs;
        lock($self->{queue});
        push(@{$self->{queue}}, @filtered);
    }

    # Get an item from the queue. The method call resolves to this sub;
    # the bare `shift` inside still calls the builtin.
    sub shift {
        my ($self) = @_;
        lock($self->{queue});
        my $item = shift @{$self->{queue}};
        return $item;
    }

    # Return true if the queue is empty
    sub empty {
        my ($self) = @_;
        lock($self->{queue});
        return !@{$self->{queue}};
    }
}
my @targets = (
    'http://nytimes.com/',
);

# The queue object itself is cloned into each worker thread, but its queue
# and db internals are :shared, so all threads see the same state.
my $job_queue = Spider::Queue->new;
foreach my $job (@targets) {
    $job_queue->add($job);
}

my @workers = make_workers(sub {
    my ($url) = @_;
    my $ua = LWP::UserAgent->new();
    my @new_links = search_url($ua, $url);
    foreach my $link (@new_links) {
        $job_queue->add($link);
    }
});
cleanup_workers(@workers);
# Create and return an array of worker threads, each running $function on
# jobs pulled from the shared queue.
sub make_workers {
    my ($function, $count) = @_;
    $count ||= 4;
    # Our list of threads we will be returning
    my @workers;
    for (my $i = 0; $i < $count; ++$i) {
        my $thread = threads->create(sub {
            my $retries = 10;
            while (1) {
                if ($job_queue->empty()) {
                    sleep(1);
                    # Only retry $retries times before exiting
                    if (--$retries == 0) {
                        last;
                    }
                    next;
                }
                my $job = $job_queue->shift();
                $function->($job);
            }
        });
        push(@workers, $thread);
    }
    return @workers;
}
sub cleanup_workers {
    my (@workers) = @_;
    while (my $worker = shift @workers) {
        # Join the thread and let it finish
        $worker->join();
    }
}
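# NOTE: a hand-rolled shared queue works, but Thread::Queue (in core since
# Perl 5.8) provides the same worker pattern with blocking dequeues instead
# of sleep-polling. It does not do the expiry/dedup tracking, which is why
# Spider::Queue exists. A minimal sketch, assuming Thread::Queue >= 3.01
# for dequeue_timed:
#
#   use Thread::Queue;
#   my $q = Thread::Queue->new(@targets);
#   my @w = map {
#       threads->create(sub {
#           # Give up after 10 idle seconds instead of counting retries
#           while (defined(my $url = $q->dequeue_timed(10))) {
#               $function->($url);
#           }
#       });
#   } 1 .. 4;
#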
# Turn a possibly relative or scheme-less link into an absolute URL
sub fixup_link {
    my ($link, $baseurl) = @_;
    # Scheme-less "www." links: default to http so URI can parse them
    $link = "http://$link" if $link =~ /^www\./i;
    # Resolve relative links against the page URL; unlike naive string
    # concatenation this handles leading slashes, "../" and query strings
    return URI->new_abs($link, $baseurl)->as_string;
}
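# For example:
#   fixup_link('../a.html', 'http://example.com/dir/page.html')
#     returns 'http://example.com/a.html'
#   fixup_link('www.example.org', 'http://example.com/')
#     returns 'http://www.example.org'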
# Find all the links in content and return an array of them
sub get_links {
    my ($content, $baseurl) = @_;
    my @links;
    my $exempt_re = qr/\.(?:png|jpe?g|gif|gifv|mov|mp4|avi|webm)$/;
    while ($content =~ /href\s*=\s*['"]?\s*([^'"\s]+)\s*['"]?/gims) {
        my $link = fixup_link($1, $baseurl);
        if ($link =~ /$exempt_re/i) {
            print STDERR "exempted: [$link]\n";
            next;
        }
        print STDERR "found link: [$link]\n";
        push(@links, $link);
    }
    return @links;
}
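# NOTE: regexes are a fragile way to parse HTML. If HTML::Parser is
# installed, HTML::LinkExtor is the more robust option; a minimal sketch:
#
#   use HTML::LinkExtor;
#   my $p = HTML::LinkExtor->new(undef, $baseurl);
#   $p->parse($content);
#   my @links = map { $_->[2] } grep { $_->[1] eq 'href' } $p->links;
#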
# Fetch a URL and return all the links on the page as an array
# TODO also index the content of the page for fulltext search
sub search_url {
    my ($ua, $url) = @_;
    my @links;
    # Politeness delay: sleep a random 0-50 seconds between fetches
    sleep 5 * rand(10);
    print STDERR "fetching url: $url\n";
    my $r = $ua->get($url);
    if (!$r->is_success()) {
        print STDERR "failed to fetch: $url: " . $r->status_line() . "\n";
        return ();
    }
    # TODO index content for fulltext search and index by url
    #
    # Parse out all the links on the page
    @links = get_links($r->decoded_content(), $url);
    return @links;
}
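# NOTE: step 3 of the plan above (robots.txt handling) is not implemented
# yet. LWP::RobotUA is a drop-in LWP::UserAgent subclass that honours
# robots.txt automatically; a minimal sketch (the agent name and contact
# address are placeholders):
#
#   use LWP::RobotUA;
#   my $ua = LWP::RobotUA->new(agent => 'spider/0.1', from => 'me@example.com');
#   $ua->delay(1/6);  # minutes to wait between requests to the same host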