基于PHP实现的网络蜘蛛,给予数个URL种子,不断查询URL页面内容,分析出新的URL放入待抓取队列,分析当前页面的所有邮箱地址,分析当前页面的迅雷账号密码,存储查询数据到文件。
抓取方案使用 宽度优先遍历策略
去重URL使用 bloom filter
连接URL使用 CURL
正常情况下应该尊重他人的网站,先爬取站点根目录下的robots.txt,根据网站管理员的意愿进行相关数据爬取。但此处例子仅为研究之用,所以没有做这一步。
/**
 * In-memory Bloom filter used for approximate set membership.
 *
 * A lookup never yields a false negative for a key that was added, but it may
 * yield a false positive; see getFalsePositiveProbability().
 */
class bloomfilter {
    // Width of the bit array (number of slots).
    protected $m;
    // Number of hash positions computed per key.
    protected $k;
    // Number of keys added so far.
    protected $n;
    // Bit array: one boolean per slot.
    protected $bitset;

    /**
     * @param int $m Width of the bit array; must be >= 1.
     * @param int $n Expected number of keys, used only to derive k; must be >= 1.
     * @throws InvalidArgumentException when $m or $n is smaller than 1.
     */
    public function __construct($m, $n) {
        $m = (int) $m;
        $n = (int) $n;
        if ($m < 1 || $n < 1) {
            throw new InvalidArgumentException('bloomfilter: $m and $n must both be >= 1');
        }
        $this->m = $m;
        // The error rate is minimal when k = ln(2) * (m / n). ceil() returns a
        // float, so cast it to keep the loop counter comparison integer-based.
        $this->k = max(1, (int) ceil(($m / $n) * log(2)));
        $this->n = 0;
        $this->bitset = array_fill(0, $m, false);
    }

    /**
     * Estimated false-positive rate for the current fill level:
     * f = (1 - e^(-k*n/m))^k
     *
     * @return float
     */
    public function getFalsePositiveProbability() {
        $exponent = (-1 * $this->k * $this->n) / $this->m;
        return pow(1 - exp($exponent), $this->k);
    }

    /**
     * Add one key, or every key of an array, to the set.
     *
     * @param mixed $key A scalar key, or an array of keys (processed recursively).
     */
    public function add($key) {
        if (is_array($key)) {
            foreach ($key as $item) {
                $this->add($item);
            }
            return;
        }
        foreach ($this->getSlots($key) as $slot) {
            $this->bitset[$slot] = true;
        }
        $this->n++;
    }

    /**
     * Test whether a key (or every key of an array) is possibly in the set.
     *
     * @param mixed $key A scalar key, or an array of keys.
     * @return bool false means "definitely not added"; true means "possibly added".
     */
    public function contains($key) {
        if (is_array($key)) {
            foreach ($key as $item) {
                if (!$this->contains($item)) {
                    return false;
                }
            }
            return true;
        }
        foreach ($this->getSlots($key) as $slot) {
            if (!$this->bitset[$slot]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Compute the k bit positions for a key.
     *
     * Positions are derived by double hashing (slot_i = h1 + i * h2 mod m) from
     * two independent slices of the key's MD5 digest. This is deterministic and
     * does not reseed the process-wide mt_rand() generator.
     *
     * @param mixed $key Scalar key; it is cast to string before hashing.
     * @return int[] k positions, each in the range [0, m - 1].
     */
    protected function getSlots($key) {
        $digest = md5((string) $key);
        // Seven hex digits = 28 bits, so the values stay positive integers even
        // on 32-bit builds. Slots above 2^28 are unreachable for larger m.
        $start = hexdec(substr($digest, 0, 7)) % $this->m;
        $step = hexdec(substr($digest, 7, 7)) % $this->m;
        if ($step === 0) {
            // A zero step would map all k probes onto the same slot.
            $step = 1;
        }
        $slots = array();
        for ($i = 0; $i < $this->k; $i++) {
            $slots[] = ($start + $i * $step) % $this->m;
        }
        return $slots;
    }
}
/**
 * Breadth-first web crawler.
 *
 * Starting from seed URLs it fetches each page with cURL, extracts new links,
 * e-mail addresses and account strings matching a fixed pattern, de-duplicates
 * them through a Bloom filter and appends every new item to an output file.
 */
class spider {
    // Bloom filter shared by URLs, e-mail addresses and account strings.
    protected $bloomfilter;
    // URLs waiting to be fetched on the current breadth-first level.
    protected $wait_urls = array();
    // Maximum crawl depth (number of breadth-first levels to fetch).
    protected $crawl_num;
    // Path of the file that receives every newly discovered item.
    protected $filepath = "phpspider_data.txt";
    // Open handle on $filepath; null after start() has closed it.
    protected $fp;

    /**
     * @param int $crawl_num      Maximum crawl depth.
     * @param int $filter_bits    Width of the de-duplication Bloom filter.
     * @param int $expected_items Expected number of distinct items; the filter
     *                            starts reporting false positives (and therefore
     *                            dropping new items) well beyond this count.
     * @throws RuntimeException when the output file cannot be opened.
     */
    public function __construct($crawl_num, $filter_bits = 200000, $expected_items = 20000) {
        $this->bloomfilter = new bloomfilter($filter_bits, $expected_items);
        $this->crawl_num = $crawl_num;
        $this->fp = fopen($this->filepath, 'w');
        if ($this->fp === false) {
            throw new RuntimeException("Unable to open output file: " . $this->filepath);
        }
    }

    /**
     * Fetch a URL over HTTP(S).
     *
     * @param string $url
     * @return string|false Response headers plus body, or false on failure.
     */
    public function get_httpdata($url) {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        // Return the response as a string instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): certificate and host verification are switched off, so
        // HTTPS responses are not authenticated. Kept as in the original.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        // Without timeouts a single unresponsive host stalls the whole crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $data = curl_exec($ch);
        curl_close($ch);
        return $data;
    }

    /**
     * Extract e-mail addresses that have not been recorded before.
     *
     * @param string|false $data Page content as returned by get_httpdata().
     * @return string[] Newly recorded addresses.
     */
    public function parse_email($data) {
        $pattern = "/([a-z0-9\-_\.]+@[a-z0-9]+\.[a-z0-9\-_\.]+)/";
        $emails = array();
        if (!is_string($data) || $data === '' || !preg_match_all($pattern, $data, $regArr)) {
            return $emails;
        }
        // Index 0 holds the full matches; the capture group repeats them.
        foreach ($regArr[0] as $email) {
            if ($this->record_if_new($email)) {
                $emails[] = $email;
            }
        }
        return $emails;
    }

    /**
     * Extract account strings matching the fixed pattern that have not been
     * recorded before, and echo each new one.
     *
     * @param string|false $data Page content as returned by get_httpdata().
     * @return string[] Newly recorded strings.
     */
    public function parse_xunlei($data) {
        $pattern = "/VIP分享网迅雷会员号\d+.*密码.\w*/";
        $xunleivips = array();
        if (!is_string($data) || $data === '' || !preg_match_all($pattern, $data, $regArr)) {
            return $xunleivips;
        }
        foreach ($regArr[0] as $vip) {
            if ($this->record_if_new($vip)) {
                $xunleivips[] = $vip;
                echo $vip."\n";
            }
        }
        return $xunleivips;
    }

    /**
     * Extract http(s) links that have not been recorded before.
     *
     * Relative links are resolved against $url; fragment-only links and links
     * with a non-http scheme (javascript:, mailto:, ...) are skipped.
     *
     * @param string       $url  URL the page was fetched from.
     * @param string|false $data Page content as returned by get_httpdata().
     * @return string[] Newly recorded absolute URLs.
     */
    public function parse_newurl($url, $data) {
        $newurl_list = array();
        if (!is_string($data) || $data === '') {
            return $newurl_list;
        }
        $parts = parse_url($url);
        if ($parts === false || empty($parts['scheme']) || empty($parts['host'])) {
            return $newurl_list;
        }
        $scheme = strtolower($parts['scheme']);
        // Site root, e.g. "http://host:8080".
        $site = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $site .= ':' . $parts['port'];
        }
        // Directory of the current page, e.g. "http://host/dir/".
        $path = isset($parts['path']) ? $parts['path'] : '/';
        if ($path === '' || $path[0] !== '/') {
            $path = '/';
        }
        $base = $site . substr($path, 0, strrpos($path, '/') + 1);

        // Match every href= attribute value.
        $pattern="|href=['\"]?([^ '\"]+)['\" ]|U";
        if (!preg_match_all($pattern, $data, $regArr, PREG_SET_ORDER)) {
            return $newurl_list;
        }
        foreach ($regArr as $match) {
            $href = $match[1];
            if ($href === '' || $href[0] === '#') {
                continue;
            }
            if (strpos($href, '//') === 0) {
                // Protocol-relative link: inherit the scheme of the current page.
                $newurl = $scheme . ':' . $href;
            } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
                // The link already carries its own scheme.
                $newurl = $href;
            } elseif ($href[0] === '/') {
                // Root-relative link.
                $newurl = $site . $href;
            } else {
                // Link relative to the current directory.
                $newurl = $base . $href;
            }
            if (!preg_match('#^https?://#i', $newurl)) {
                continue;
            }
            if ($this->record_if_new($newurl)) {
                $newurl_list[] = $newurl;
            }
        }
        return $newurl_list;
    }

    /**
     * Run the crawl: fetch the seeds, then each following level of newly found
     * URLs, up to the configured depth. The output file is closed at the end.
     *
     * @param string[] $seeds Seed URLs.
     */
    public function start($seeds) {
        if (!is_resource($this->fp)) {
            // A previous run closed the file; append so earlier data is kept.
            $this->fp = fopen($this->filepath, 'a');
            if ($this->fp === false) {
                throw new RuntimeException("Unable to open output file: " . $this->filepath);
            }
        }
        foreach ($seeds as $url) {
            // Mark seeds as seen so pages linking back to them do not cause
            // the same seed to be fetched again on a later level.
            if (!$this->bloomfilter->contains($url)) {
                $this->bloomfilter->add($url);
            }
            $this->wait_urls[] = $url;
        }
        for ($level = 1; $level <= $this->crawl_num; $level++) {
            $next_level = array();
            foreach ($this->wait_urls as $url) {
                $data = $this->get_httpdata($url);
                if ($data === false || $data === '') {
                    // Fetch failed or returned nothing; nothing to parse.
                    continue;
                }
                foreach ($this->parse_newurl($url, $data) as $found) {
                    $next_level[] = $found;
                }
                $this->parse_email($data);
                $this->parse_xunlei($data);
            }
            if (empty($next_level)) {
                break;
            }
            $this->wait_urls = $next_level;
        }
        fclose($this->fp);
        $this->fp = null;
    }

    /**
     * Record an item unless the Bloom filter reports it as already seen.
     *
     * @param string $item
     * @return bool true when the item was new and has been written to the file.
     */
    protected function record_if_new($item) {
        if ($this->bloomfilter->contains($item)) {
            return false;
        }
        $this->bloomfilter->add($item);
        if (is_resource($this->fp)) {
            fwrite($this->fp, $item."\n");
        }
        return true;
    }
}
// Crawl two breadth-first levels, starting from a single seed URL.
$spider = new spider(2);
$seeds = array();
$seeds[] = "http://www.vipfenxiang.com/xunlei/";
$spider->start($seeds);
打印结果
VIP分享网迅雷会员号105903134:1密码9112869
VIP分享网迅雷会员号121748863:2密码9408329
VIP分享网迅雷会员号51717:1密码9917189
VIP分享网迅雷会员号895909586:1密码9609849
VIP分享网迅雷会员号113444810:1密码9798679
VIP分享网迅雷会员号893333915:2密码9034889
VIP分享网迅雷会员号591295202:1密码9025829
VIP分享网迅雷会员号893945407:2密码9500839
VIP分享网迅雷会员号724006306:2密码9612349
VIP分享网迅雷会员号894245093:1密码9345689
VIP分享网迅雷会员号242689460:1密码9322579
VIP分享网迅雷会员号146376319:2密码9780319
VIP分享网迅雷会员号845429536:1密码9736349
VIP分享网迅雷会员号113573595:2密码9594039
VIP分享网迅雷会员号798787259:2密码9645789
VIP分享网迅雷会员号902948934:1密码9920719
VIP分享网迅雷会员号893013018:1密码9516069
VIP分享网迅雷会员号123586914:1密码9937689
VIP分享网迅雷会员号421190942:2密码9519779
VIP分享网迅雷会员号842741066:2密码9796549
VIP分享网迅雷会员号731396967:1密码9000939
VIP分享网迅雷会员号838797970:2密码9168859
VIP分享网迅雷会员号198231qin:1密码9775239
VIP分享网迅雷会员号395922976:1密码9705719
VIP分享网迅雷会员号269560190:2密码9503149
VIP分享网迅雷会员号903066726:2密码9745109
VIP分享网迅雷会员号776447864:2密码9847259
VIP分享网迅雷会员号828859858:2密码9428479
VIP分享网迅雷会员号774256581:2密码9500049
VIP分享网迅雷会员号903142190:1密码9653049
VIP分享网迅雷会员号830649436:1密码9772179
VIP分享网迅雷会员号790262168:1密码9601009
VIP分享网迅雷会员号834895524:1密码9267899
VIP分享网迅雷会员号840313877:1密码9178639
VIP分享网迅雷会员号896151846:2密码9074189
VIP分享网迅雷会员号793535475:2密码9260239
VIP分享网迅雷会员号149244502:1密码9105749
VIP分享网迅雷会员号894138315:2密码9822639
VIP分享网迅雷会员号895467021:1密码9654069
VIP分享网迅雷会员号13829266904:2密码9290379
VIP分享网迅雷会员号353615200:1密码9749319
VIP分享网迅雷会员号311274570:1密码9656289
VIP分享网迅雷会员号893243411:1密码9161039
VIP分享网迅雷会员号308349140:2密码9933679
VIP分享网迅雷会员号892730567:2密码9770929
VIP分享网迅雷会员号211708473:1密码9162589
VIP分享网迅雷会员号838462015:1密码9523929
VIP分享网迅雷会员号758415243:2密码9121699
VIP分享网迅雷会员号696094939:2密码9015009
VIP分享网迅雷会员号916301164:1密码9800369
VIP分享网迅雷会员号894998314:1密码9753019
详细可以查看执行后的数据文件:phpspider_data.txt