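#!/usr/bin/perl
# A small crawler that downloads every image from the Baidu Tieba "chinajoy"
# forum. It can be adapted to crawl other places, and the crawl breadth and
# depth can be changed freely. Because a full crawl takes a long time, it is
# set up to visit only the threads listed on the first 5 index pages, and only
# the first 5 pages of each thread. Images are downloaded automatically.
use strict;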
use warnings;
use Encode;
use LWP::UserAgent;
use LWP::Simple;
# Shared HTTP client. The 10-second timeout keeps one stalled request from
# blocking the whole crawl.
my $ua = LWP::UserAgent->new();
$ua->timeout(10);
#$ua -> proxy('http', 'http://##########/');

# Running counter used to name the downloaded files (1.jpg, 2.jpg, ...).
my $picnum = 1;

# Relative thread paths of the form "/p/<digits>" found on the forum index
# pages, kept in first-seen order with duplicates removed.
my @pages;

# Crawl breadth: how many index pages to fetch, and how far the "pn" query
# offset advances per index page (presumably the number of threads listed on
# one index page -- confirm against the site before changing).
my $index_page_count  = 5;
my $index_offset_step = 50;

# Thread paths already collected, so a thread linked several times is only
# crawled once.
my %seen_page;

foreach my $i (0 .. $index_page_count - 1) {
    my $num = $i * $index_offset_step;
    my $url = 'http://tieba.baidu.com/f?kw=chinajoy&pn=' . $num;
    my $r   = $ua->get($url);

    # Report the failure and move on instead of silently skipping the page.
    unless ($r->is_success) {
        warn "Failed to fetch $url: " . $r->status_line . "\n";
        next;
    }

    # decoded_content may be undefined when the body cannot be decoded.
    my $content = $r->decoded_content;
    next unless defined $content;
    my $re = encode("utf8", $content);

    # Require at least one digit so a bare "/p/" link is never collected.
    while ($re =~ m{a href="(/p/[0-9]+)"}g) {
        my $thread_path = $1;
        next if $seen_page{$thread_path}++;
        push @pages, $thread_path;
    }
}
# Crawl depth: how many pages of each thread are scanned for images.
my $pages_per_thread = 5;

# Visit the first pages of every collected thread and save each referenced
# .jpg image into the current directory as <picnum>.jpg.
foreach my $page (@pages) {
    foreach my $n (1 .. $pages_per_thread) {
        my $url = 'http://tieba.baidu.com' . $page . '?pn=' . $n;
        print "$url\n";
        my $r = $ua->get($url);

        # Report the failure and continue with the next page.
        unless ($r->is_success) {
            warn "Failed to fetch $url: " . $r->status_line . "\n";
            next;
        }

        # decoded_content may be undefined when the body cannot be decoded.
        my $content = $r->decoded_content;
        next unless defined $content;
        my $re = encode("utf8", $content);

        # [^"] keeps the match inside a single src attribute; the previous
        # ".*?" could run past the closing quote into later markup, and the
        # unescaped "jpg" matched anywhere rather than as a file extension.
        while ($re =~ m{src="(https?://[^"]+?\.jpg)"}g) {
            my $res  = $1;
            my $file = $picnum . ".jpg";

            # Download through the shared client so its timeout (and proxy,
            # if enabled) applies to image requests as well.
            my $img = $ua->get($res, ':content_file' => $file);

            # X-Died is set by LWP when saving the body was interrupted;
            # treat that like a failed request and drop the partial file.
            my $died = $img->header('X-Died');
            if (!$img->is_success || defined $died) {
                my $reason = defined $died ? $died : $img->status_line;
                warn "Failed to download $res: $reason\n";
                unlink $file if -e $file;
                next;
            }

            # Only a successful download consumes a file number.
            print "$res -> $file\n";
            $picnum++;
        }
    }
}