杨旭光的网站

基于PHP实现的网络蜘蛛：给定若干URL种子，不断抓取URL对应的页面内容，从中解析出新的URL放入待抓取队列，同时提取当前页面中的所有邮箱地址和迅雷账号密码，并将抓取结果保存到文件。

抓取方案使用 宽度优先遍历策略

去重URL使用 bloom filter

连接URL使用 CURL

正常情况下应该尊重他人的网站，先抓取站点根目录下的robots.txt，根据网站管理员的意愿进行相关数据爬取。但此处的例子仅作研究之用，所以没有做这一步。

/**
 * Fixed-size Bloom filter: a probabilistic set that never reports a false
 * negative but may report false positives.
 *
 * Slot positions are derived from md5 digests of the key. They are NOT derived
 * from mt_srand()/mt_rand(), because seeding the global generator on every
 * lookup would make every other mt_rand() call in the process predictable.
 */
class bloomfilter {

    // Width of the bit array (number of slots).
    protected $m;

    // Number of hash functions applied to each key.
    protected $k;

    // Number of keys added so far.
    protected $n;

    // The bit array: $m booleans, true means the slot is set.
    protected $bitset;

    /**
     * @param int       $m Number of slots in the bit array; must be >= 1.
     * @param int|float $n Expected number of keys; must be > 0. Only used to
     *                     pick the number of hash functions.
     * @throws InvalidArgumentException When $m or $n is not positive.
     */
    public function __construct($m, $n) {
        $m = (int) $m;
        if ($m < 1 || !($n > 0)) {
            throw new InvalidArgumentException('bloomfilter requires $m >= 1 and $n > 0');
        }
        $this->m = $m;
        // The error rate is minimal for k = ln(2) * (m / n) hash functions.
        // Stored as an int (ceil() returns a float) and never below 1.
        $this->k = max(1, (int) ceil(($m / $n) * log(2)));
        $this->n = 0;
        $this->bitset = array_fill(0, $this->m, false);
    }

    /**
     * Estimated false-positive rate for the current fill level:
     * f = (1 - e^(-k*n/m))^k
     *
     * @return float
     */
    public function getFalsePositiveProbability() {
        $exp = (-1 * $this->k * $this->n) / $this->m;
        return pow(1 - exp($exp), $this->k);
    }

    /**
     * Adds a key, or every element of an array of keys, to the set.
     *
     * @param string|array $key
     * @return void
     */
    public function add($key) {
        if (is_array($key)) {
            foreach ($key as $k) {
                $this->add($k);
            }
            return;
        }
        foreach ($this->getSlots($key) as $slot) {
            $this->bitset[$slot] = true;
        }
        $this->n++;
    }

    /**
     * Tests membership. For an array, returns true only if every element is
     * (probably) present.
     *
     * @param string|array $key
     * @return bool false = definitely absent, true = probably present.
     */
    public function contains($key) {
        if (is_array($key)) {
            foreach ($key as $k) {
                if (!$this->contains($k)) {
                    return false;
                }
            }
            return true;
        }
        foreach ($this->getSlots($key) as $slot) {
            if (!$this->bitset[$slot]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the $k slot indexes for a key.
     *
     * Each index comes from an md5 digest of the key salted with the hash
     * function's ordinal, so the result is deterministic and touches no
     * global state.
     *
     * @param string $key
     * @return int[] $k indexes, each in the range 0 .. $m-1.
     */
    protected function getSlots($key) {
        $key = (string) $key;
        // Take as many hex digits as fit in a native non-negative integer:
        // 15 digits (60 bits) on 64-bit builds, 7 digits (28 bits) on 32-bit.
        $digits = PHP_INT_SIZE >= 8 ? 15 : 7;
        $slots = array();
        for ($i = 0; $i < $this->k; $i++) {
            $digest = md5($i . ':' . $key);
            $slots[] = hexdec(substr($digest, 0, $digits)) % $this->m;
        }
        return $slots;
    }
}

/**
 * Breadth-first web crawler.
 *
 * Starting from seed URLs it downloads each page, extracts links, e-mail
 * addresses and Xunlei account strings, and writes every newly seen item to
 * the output file, one per line. A single Bloom filter is shared by all three
 * item kinds for de-duplication, so an occasional new item may be dropped as
 * a false positive.
 *
 * An instance is single-use: start() closes the output file when it finishes.
 */
class spider {

    // Bloom filter used to de-duplicate URLs, e-mails and account strings.
    protected $bloomfilter;
    // URLs to fetch at the current crawl level.
    protected $wait_urls = array();
    // Maximum crawl depth (number of breadth-first levels).
    protected $crawl_num;
    // File that every newly found item is written to.
    protected $filepath = "phpspider_data.txt";
    // Write handle for $filepath.
    protected $fp;

    /**
     * Opens (and truncates) the output file and creates the filter.
     *
     * @param int $crawl_num       Maximum crawl depth.
     * @param int $filter_bits     Width of the Bloom filter bit array. The
     *                             former hard-coded 500 bits saturated after a
     *                             few hundred items and made the crawler
     *                             discard almost every new URL.
     * @param int $filter_capacity Expected number of distinct items.
     * @throws RuntimeException When the output file cannot be opened.
     */
    public function __construct($crawl_num, $filter_bits = 200000, $filter_capacity = 20000) {
        $this->bloomfilter = new bloomfilter($filter_bits, $filter_capacity);
        $this->crawl_num = $crawl_num;
        $this->fp = fopen($this->filepath, 'w');
        if ($this->fp === false) {
            throw new RuntimeException("Unable to open output file: " . $this->filepath);
        }
    }

    /**
     * Fetches a URL with cURL.
     *
     * @param string $url
     * @return string|false Response headers followed by the body, or false
     *                      when the transfer fails.
     */
    public function get_httpdata($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        // Return the response from curl_exec() instead of printing it.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): TLS certificate and host verification are disabled, so
        // HTTPS responses are not authenticated. Acceptable for an experiment
        // only; enable both before trusting anything that is fetched.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        // Bound every request so one unresponsive host cannot stall the crawl.
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $data = curl_exec($ch);
        curl_close($ch);

        return $data;
    }

    /**
     * Extracts e-mail addresses that have not been seen before and records
     * them in the output file.
     *
     * @param string $data Page content.
     * @return string[] Newly seen addresses.
     */
    public function parse_email($data) {
        $pattern = "/([a-z0-9\-_\.]+@[a-z0-9]+\.[a-z0-9\-_\.]+)/";
        $emails = array();
        if (!preg_match_all($pattern, (string) $data, $regArr)) {
            return $emails;
        }
        // The only capture group spans the whole pattern, so the full-match
        // list already holds every address.
        foreach ($regArr[0] as $email) {
            if ($this->record_if_new($email)) {
                $emails[] = $email;
            }
        }
        return $emails;
    }

    /**
     * Extracts Xunlei VIP account strings that have not been seen before,
     * records them in the output file and echoes each one.
     *
     * @param string $data Page content.
     * @return string[] Newly seen account strings.
     */
    public function parse_xunlei($data) {
        $pattern = "/VIP分享网迅雷会员号\d+.*密码.\w*/";
        $xunleivips = array();
        if (!preg_match_all($pattern, (string) $data, $regArr)) {
            return $xunleivips;
        }
        foreach ($regArr[0] as $vip) {
            if ($this->record_if_new($vip)) {
                $xunleivips[] = $vip;
                echo $vip."\n";
            }
        }
        return $xunleivips;
    }

    /**
     * Extracts the href targets of a page, resolves them against the page URL
     * and records the http(s) URLs that have not been seen before.
     *
     * @param string $url  URL the page was fetched from.
     * @param string $data Page content.
     * @return string[] Newly seen absolute URLs.
     */
    public function parse_newurl($url, $data) {
        // Match every href= attribute value (quoted or unquoted).
        $pattern="|href=['\"]?([^ '\"]+)['\" ]|U";
        $newurl_list = array();
        if (!preg_match_all($pattern, (string) $data, $regArr, PREG_SET_ORDER)) {
            return $newurl_list;
        }

        foreach ($regArr as $match) {
            $newurl = $this->resolve_url($url, $match[1]);
            if ($newurl !== null && $this->record_if_new($newurl)) {
                $newurl_list[] = $newurl;
            }
        }

        return $newurl_list;
    }

    /**
     * Runs the crawl: fetches every queued URL level by level until the
     * configured depth is reached or a level yields no new URLs, then closes
     * the output file.
     *
     * @param string[] $seeds Start URLs.
     * @return void
     * @throws RuntimeException When the output file is not open, e.g. because
     *                          start() has already run on this instance.
     */
    public function start($seeds) {
        if (!is_resource($this->fp)) {
            throw new RuntimeException("Output file is not open: " . $this->filepath);
        }

        foreach ($seeds as $url) {
            $this->wait_urls[] = $url;
            // Mark seeds as seen so a page linking back to a seed does not
            // queue it a second time.
            $this->bloomfilter->add($url);
        }

        for ($level = 1; $level <= $this->crawl_num; $level++) {
            $urls = array();
            foreach ($this->wait_urls as $url) {
                $data = $this->get_httpdata($url);
                if ($data === false) {
                    // Transfer failed: nothing to parse for this URL.
                    continue;
                }
                foreach ($this->parse_newurl($url, $data) as $found) {
                    $urls[] = $found;
                }
                $this->parse_email($data);
                $this->parse_xunlei($data);
            }
            if (empty($urls)) {
                break;
            }
            $this->wait_urls = $urls;
        }
        fclose($this->fp);
    }

    /**
     * Records an item unless the filter reports it as already seen.
     *
     * @param string $item
     * @return bool true when the item was new and has been written to the file.
     */
    protected function record_if_new($item) {
        if ($this->bloomfilter->contains($item)) {
            return false;
        }
        $this->bloomfilter->add($item);
        fwrite($this->fp, $item."\n");
        return true;
    }

    /**
     * Turns an href value into an absolute http(s) URL.
     *
     * Dot segments ("./", "../") are not normalised.
     *
     * @param string $page_url URL of the page the href was found on.
     * @param string $href     Raw href value.
     * @return string|null Absolute URL, or null when the href is not a
     *                     crawlable http(s) link.
     */
    protected function resolve_url($page_url, $href) {
        // Already absolute.
        if (preg_match('#^https?://#i', $href)) {
            return $href;
        }

        $parts = parse_url($page_url);
        $has_site = is_array($parts)
            && isset($parts['scheme'], $parts['host'])
            && preg_match('/^https?$/i', $parts['scheme']);

        // Protocol-relative link ("//host/path"): inherit the page's scheme.
        if (substr($href, 0, 2) === '//') {
            return $has_site ? $parts['scheme'] . ':' . $href : null;
        }

        // Any other scheme (mailto:, javascript:, ftp:, ...) and fragment-only
        // links do not point at a new page to fetch.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) || substr($href, 0, 1) === '#') {
            return null;
        }

        if (!$has_site) {
            return null;
        }

        $site = $parts['scheme'] . '://' . $parts['host']
            . (isset($parts['port']) ? ':' . $parts['port'] : '');

        // Root-relative link.
        if (substr($href, 0, 1) === '/') {
            return $site . $href;
        }

        // Document-relative link: resolve against the directory of the page.
        // A URL without a path ("http://host") is treated as "/".
        $path = isset($parts['path']) ? $parts['path'] : '/';
        $dir = substr($path, 0, strrpos($path, '/') + 1);

        return $site . $dir . $href;
    }
}

// Crawl two levels deep, starting from a single seed page.
$spider = new spider(2);
$spider->start(array("http://www.vipfenxiang.com/xunlei/"));

打印结果

VIP分享网迅雷会员号105903134:1密码9112869
VIP分享网迅雷会员号121748863:2密码9408329
VIP分享网迅雷会员号51717:1密码9917189
VIP分享网迅雷会员号895909586:1密码9609849
VIP分享网迅雷会员号113444810:1密码9798679
VIP分享网迅雷会员号893333915:2密码9034889
VIP分享网迅雷会员号591295202:1密码9025829
VIP分享网迅雷会员号893945407:2密码9500839
VIP分享网迅雷会员号724006306:2密码9612349
VIP分享网迅雷会员号894245093:1密码9345689
VIP分享网迅雷会员号242689460:1密码9322579
VIP分享网迅雷会员号146376319:2密码9780319
VIP分享网迅雷会员号845429536:1密码9736349
VIP分享网迅雷会员号113573595:2密码9594039
VIP分享网迅雷会员号798787259:2密码9645789
VIP分享网迅雷会员号902948934:1密码9920719
VIP分享网迅雷会员号893013018:1密码9516069
VIP分享网迅雷会员号123586914:1密码9937689
VIP分享网迅雷会员号421190942:2密码9519779
VIP分享网迅雷会员号842741066:2密码9796549
VIP分享网迅雷会员号731396967:1密码9000939
VIP分享网迅雷会员号838797970:2密码9168859
VIP分享网迅雷会员号198231qin:1密码9775239
VIP分享网迅雷会员号395922976:1密码9705719
VIP分享网迅雷会员号269560190:2密码9503149
VIP分享网迅雷会员号903066726:2密码9745109
VIP分享网迅雷会员号776447864:2密码9847259
VIP分享网迅雷会员号828859858:2密码9428479
VIP分享网迅雷会员号774256581:2密码9500049
VIP分享网迅雷会员号903142190:1密码9653049
VIP分享网迅雷会员号830649436:1密码9772179
VIP分享网迅雷会员号790262168:1密码9601009
VIP分享网迅雷会员号834895524:1密码9267899
VIP分享网迅雷会员号840313877:1密码9178639
VIP分享网迅雷会员号896151846:2密码9074189
VIP分享网迅雷会员号793535475:2密码9260239
VIP分享网迅雷会员号149244502:1密码9105749
VIP分享网迅雷会员号894138315:2密码9822639
VIP分享网迅雷会员号895467021:1密码9654069
VIP分享网迅雷会员号13829266904:2密码9290379
VIP分享网迅雷会员号353615200:1密码9749319
VIP分享网迅雷会员号311274570:1密码9656289
VIP分享网迅雷会员号893243411:1密码9161039
VIP分享网迅雷会员号308349140:2密码9933679
VIP分享网迅雷会员号892730567:2密码9770929
VIP分享网迅雷会员号211708473:1密码9162589
VIP分享网迅雷会员号838462015:1密码9523929
VIP分享网迅雷会员号758415243:2密码9121699
VIP分享网迅雷会员号696094939:2密码9015009
VIP分享网迅雷会员号916301164:1密码9800369
VIP分享网迅雷会员号894998314:1密码9753019

详细可以查看执行后的数据文件：phpspider_data.txt

网络爬虫实现抓取迅雷VIP账号密码