当前位置:
首页 > 技术 > PHP多线程批量采集器

PHP多线程批量采集器

来吧,话不多说,我们只需要2个php文件即可实现!

先告诉大家,此采集方法参考了Github上 Xfeng 同学的 MultiHttpRequest 类,首先需要感谢这位同学,虽然之前有用过curl,但没想到可以多线程,又是一点点进步!

此采集程序均使用 php 自带函数(如果没有开启curl,请先开启curl),没有额外知识,开始吧!

文件布局如下:

PHP多线程批量采集器

首先编写核心文件(libs/class_curl_multi.php):

<?php
/*
* Curl 多线程类
* 使用方法:
* ========================
$urls = array("http://baidu.com", "http://dzone.com", "http://google.com");
$mp = new MultiHttpRequest($urls);
$mp->start();
* ========================
* 当然,如果你喜欢,还可以对此类进行扩展,
* 比如,如果需要用户登录才能采集的数据怎么办?
* 只要我们使用 curl 来做伪登录,把 cookie 保存到文件,
* 每次请求发送有效的 cookie 即可实现伪登录抓取数据!
*/
/**
 * Fetches several URLs concurrently through PHP's curl_multi API.
 *
 * The setters return $this so calls can be chained, e.g.
 * $mp->set_urls($urls)->is_return_header(0)->start().
 */
class MultiHttpRequest {
    /** @var array|false URLs to fetch; the array keys are preserved in the result of start(). */
    public $urls = array();
    /** @var int|bool Value passed to CURLOPT_HEADER (truthy = include response headers in the body). */
    public $curlopt_header = 0;
    /** @var string Upper-cased HTTP method name. Stored only; this class does not apply it to requests. */
    public $method = "GET";

    /**
     * @param array|false $urls Initial URL list; may be supplied later via set_urls().
     */
    public function __construct($urls = false) {
        $this->urls = $urls;
    }

    /**
     * Replace the URL list.
     *
     * @param array $urls
     * @return $this
     */
    public function set_urls($urls) {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Choose whether response headers are returned together with the body.
     *
     * @param int|bool $b
     * @return $this
     */
    public function is_return_header($b) {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Store the HTTP method name in upper case.
     *
     * @param string $m
     * @return $this
     */
    public function set_method($m) {
        // The original assigned to a misspelled property ("medthod"), so
        // $method never changed and a stray dynamic property was created.
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetch every configured URL concurrently.
     *
     * @return array|false Response bodies keyed like $urls, or false when
     *                     $urls is not a non-empty array.
     */
    public function start() {
        if (!is_array($this->urls) || count($this->urls) == 0) {
            return false;
        }

        $curl = $text = array();
        $handle = curl_multi_init();
        // No output here: the original echoed every key, which leaked debug
        // text into the page (and into anything buffered around this call).
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }

        $this->exec_handle($handle);

        foreach ($this->urls as $k => $v) {
            $text[$k] = curl_multi_getcontent($curl[$k]);
            curl_multi_remove_handle($handle, $curl[$k]);
        }
        curl_multi_close($handle);

        return $text;
    }

    /**
     * Create an easy handle for $url and attach it to the multi handle.
     *
     * @param resource|object $handle Multi handle from curl_multi_init().
     * @param string          $url
     * @return resource|object The new easy handle.
     */
    private function add_handle($handle, $url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        // Return the body from curl_multi_getcontent() instead of printing it.
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Drive all attached transfers until none is still running.
     *
     * @param resource|object $handle Multi handle from curl_multi_init().
     */
    private function exec_handle($handle) {
        $running = null;
        do {
            $status = curl_multi_exec($handle, $running);
            if ($running > 0 && $status === CURLM_OK) {
                // Sleep until a socket has activity instead of spinning at
                // full CPU. Some libcurl builds return -1 immediately, so
                // back off briefly in that case.
                if (curl_multi_select($handle, 1.0) === -1) {
                    usleep(1000);
                }
            }
            // Stop on any real error status rather than looping forever.
        } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));
    }

    /**
     * Fetch a single URL synchronously with a 10 second connect timeout.
     *
     * @param string $url
     * @return string|false Response body, or false on failure.
     */
    public function get_content($url){
        $ch = curl_init();
        curl_setopt ($ch, CURLOPT_URL, $url);
        curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($ch, CURLOPT_CONNECTTIMEOUT,10);
        return curl_exec($ch);
    }
}

然后是入口文件(splider.php):

<?php
/**
* PHP通用多线程采集
*/
// Send the progress log as UTF-8 HTML.
header("Content-type: text/html;charset=utf-8");
error_reporting(E_ALL);
ini_set('display_errors', true);
// Remove the execution time limit; a crawl can run for a long time.
set_time_limit(0);

require("libs/class_curl_multi.php");

// Connect to the database; fill in the password of your own database.
$link = mysql_connect("localhost","root","");
if (!$link) {
    // Without a connection every later insert would fail, so stop before crawling.
    die("mysql_connect failed: ".mysql_error());
}
if (!mysql_select_db("test",$link)) {
    die("mysql_select_db failed: ".mysql_error($link));
}

// Empty the target table so each run starts from scratch.
mysql_query("TRUNCATE TABLE content", $link);

// Set the connection character set.
mysql_query("SET NAMES 'utf8'", $link);

// Domain prefix of the target site.
// NOTE(review): not referenced anywhere else in the visible script.
$base = "http://xxx.com";

// List-page URL rules to crawl (paginated). The "[a-b]" part is read by the
// crawl loop as a page range: "[1-2]" expands to .../page/1 and .../page/2.
$list = array(
'http://xxx.com/page/[1-2]'
);

// Regular expression (without delimiters) applied to each list page;
// capture group 1 is the link to a detail page.
$list_rules = '<li class="post box row ">.*?<a href="(.*?)".*?>.*?</a>.*?</li>';

// Regular expressions (without delimiters) applied to each detail page.
// The key names the stored field and capture group 1 is its value.
// 'product_image' and 'product_description' get extra handling in the crawl loop.
$detail_rules = array(
'meta_title'=>'<title>(.*?)</title>',
'meta_keywords'=>'<meta name="keywords" content="(.*?)" />',
'meta_description'=>'<meta name="description" content="(.*?)" />',
'product_name'=>'<h1>(.*?)</h1>',
'product_image'=>'<div class="v-inner">.*?<a href="(.*?)" id="originalImg"><img src=".*?" alt=".*?" /></a>.*?</div>',
'product_price'=>'<span class="info_views info_ico">(.*?)</span>',
'product_description'=>'<div class="context">(.*?)</div>',
);

// Shared fetcher instance.
$mp = new MultiHttpRequest();

// Running number of stored records, printed in the log for debugging.
$j = 1;

// Maximum number of URLs fetched concurrently in one batch.
$limit = 10;

// Paging bookkeeping value intended for the crawl loop (original comment:
// "number of pages skipped while paging"); starts at 0.
$last_page = 0;

//开始采集
// Crawl every list rule: expand its page range, fetch the list pages in
// batches of $limit, then fetch and store the detail pages they link to.
//
// Batching uses array_chunk(), so a trailing partial batch is always
// processed. The previous hand-rolled batching dropped the last list page or
// detail link whenever it fell exactly on a full batch.
// The loop variable is not called $link, so the database handle stays intact.
foreach ($list as $list_rule) {

    // Parse the "[first-last]" page range out of the rule; skip rules without one.
    if (!preg_match('/\[(.*)\]/i', $list_rule, $_page) || $_page[1] == '') {
        continue;
    }
    $pages = explode('-', $_page[1]);
    if (count($pages) != 2) {
        continue;
    }

    // Expand the range into one URL per list page.
    $page_urls = array();
    for ($i = $pages[0]; $i <= $pages[1]; $i++) {
        $page_urls[] = preg_replace('/\[(.*)\]/i', $i, $list_rule);
    }

    foreach (array_chunk($page_urls, $limit) as $urls) {

        // Fetch one batch of list pages.
        $mp->set_urls($urls);
        $contents = $mp->start();
        if (!is_array($contents)) {
            continue;
        }

        foreach ($contents as $content) {
            // A failed transfer yields null; normalise to a string first.
            $content = _prefilter((string)$content);

            // Extract the detail links. addslashes() backslash-escapes the
            // quotes in the rule (harmless inside PCRE) and '/' is escaped
            // because it is the pattern delimiter.
            // NOTE(review): addslashes() also doubles backslashes, so rules
            // cannot contain regex escapes such as \d.
            preg_match_all('/'.str_replace('/', '\/', addslashes($list_rules)).'/i',$content,$pregArr);
            if (empty($pregArr[1])) {
                continue;
            }

            // Key by URL so a link that appears twice on a page is fetched once.
            $detail_urls = array();
            foreach ($pregArr[1] as $detail_value) {
                if ($detail_value != '') {
                    $detail_urls[$detail_value] = $detail_value;
                }
            }

            // preserve_keys = true keeps the URL as the key of each response.
            foreach (array_chunk($detail_urls, $limit, true) as $detail_batch) {

                $mp->set_urls($detail_batch);
                $details = $mp->start();
                if (!is_array($details)) {
                    continue;
                }

                // Local image path => remote image URL, for this batch.
                $images_urls = array();

                // Parse and store each detail page.
                foreach ($details as $url_key => $detail) {
                    $detail = _prefilter((string)$detail);
                    $data = array();

                    foreach ($detail_rules as $key => $value) {

                        preg_match_all('/'.str_replace('/', '\/', addslashes($value)).'/i',$detail,$detailArr);
                        // A rule that does not match yields an empty field
                        // instead of an undefined-offset error.
                        $matched = isset($detailArr[1][0]) ? $detailArr[1][0] : '';

                        // Fields that need special handling.
                        switch ($key) {
                            case 'product_image':
                                if ($matched == '') {
                                    $data[$key] = '';
                                    break;
                                }
                                // Store under a name derived from the remote
                                // URL; queue the download only if missing.
                                $data[$key] = "images/".md5($matched).".jpg";
                                if (!file_exists($data[$key])) {
                                    $images_urls[$data[$key]] = $matched;
                                }
                                break;
                            case 'product_description':
                                $data[$key] = trim(strip_tags($matched));
                                break;
                            default:
                                $data[$key] = $matched;
                                break;
                        }
                    }

                    // URL of the detail page.
                    $data['product_url'] = $url_key;

                    // Escape the scraped values before building the SQL string.
                    foreach ($data as $_k => $_v) {
                        $data[$_k] = addslashes($_v);
                    }

                    // Store the record.
                    $r = mysql_query("
insert into `content` values(
null,
'{$data['meta_title']}',
'{$data['meta_keywords']}',
'{$data['meta_description']}',
'{$data['product_name']}',
'{$data['product_image']}',
'{$data['product_price']}',
'{$data['product_description']}',
'{$data['product_url']}')");

                    // Log: record number | query result | product name.
                    _flush($j++."|".$r."|".$data['product_name']."<br/>");
                }

                // Download the images referenced by this batch.
                if (count($images_urls) > 0) {
                    $mp->set_urls($images_urls);
                    $images = $mp->start();
                    foreach ((array)$images as $image_key => $image_value) {
                        // Skip failed or empty downloads: writing an empty
                        // file would make file_exists() block any later retry.
                        if (empty($image_key) || $image_value === null || $image_value === false || $image_value === '') {
                            continue;
                        }
                        _flush("store image:".$image_key."<br/>");
                        file_put_contents($image_key,$image_value);
                    }
                }
            }
        }
    }
}

/**
 * Print a log message and push it to the client immediately.
 *
 * @param mixed $msg Value to print; rendered with print_r().
 */
function _flush($msg) {
    print_r($msg);
    // ob_flush() raises a notice when no output buffer is active, so only
    // call it when one exists.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
}

/**
 * Light clean-up of fetched markup so the extraction patterns match more
 * easily: strips some comments and collapses whitespace.
 *
 * @param string $output Raw fetched content.
 * @return string Filtered content.
 */
function _prefilter($output) {
    // Applied strictly in this order; each entry is array(pattern, replacement).
    $steps = array(
        // "//...;" followed by a line break
        array("/\/\/[\S\f\t\v ]*?;[\r|\n]/", ""),
        // HTML comments
        array("/\<\!\-\-[\s\S]*?\-\-\>/", ""),
        // whitespace between tags
        array("/\>[\s]+\</", "><"),
        // whitespace after ';'
        array("/;[\s]+/", ";"),
        // whitespace around braces
        array("/[\s]+\}/", "}"),
        array("/}[\s]+/", "}"),
        array("/\{[\s]+/", "{"),
        // runs of whitespace become a single whitespace character
        array("/([\s]){2,}/", "$1"),
        // whitespace around '='
        array("/[\s]+\=[\s]+/", "="),
    );

    foreach ($steps as $step) {
        $output = preg_replace($step[0], $step[1], $output);
    }

    return $output;
}
?>

准备好这两个文件后,我们还要准备一下数据库:需要创建一个名为“test”的数据库(与 splider.php 中 mysql_select_db 使用的库名一致),然后执行以下语句创建表:

--
-- 数据库: `test`
--

-- --------------------------------------------------------

--
-- 表的结构 `content`
--

-- Target table for the crawler: one row per scraped detail page.
-- utf8_general_ci is a collation, not a character set, so each text column
-- declares CHARACTER SET utf8 together with COLLATE utf8_general_ci.
CREATE TABLE IF NOT EXISTS `content` (
`id` int(9) NOT NULL AUTO_INCREMENT,
`meta_title` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`meta_keywords` text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`meta_description` text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`product_name` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`product_image` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`product_price` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`product_description` text CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
`product_url` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
UNIQUE KEY `id` (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 AUTO_INCREMENT=1 ;

OK!万事大吉!让我们运行起来吧:

1|1|test by krishna
2|1|Smart Rotary Leather Case for iPad 2, iPad 3 and iPad 4th Generation Black
3|1|Test by Vivek
4|1|Smart Cover Protective PU Three-fold Leather Case for iPad 2 Black
5|1|Test by Vivek1
6|1|test gaurav
7|1|Ctech 360 Degrees Rotating Stand Brown Stylish Grid Plaid Leather Case for iPad 2 or 3 2nd generation with Smart Cover
8|1|Ctech 360 Degrees Rotating Stand Leather Smart Case for Apple iPad 2 or3 Red Luxury Crocodile Pattern
9|1|Test by Vivek4
10|1|Test by Vivek3
store image:images/46d877b6c3c34ee0e479ec6df88a932d.jpg
store image:images/1d7ef11ff47a2686f80a6cc4b90da276.jpg
store image:images/ecbc975af37d4f0df0ebf25b114e4270.jpg
store image:images/6c1f8f801af39cdfeb9f2d69ca4aeac9.jpg
store image:images/7c164e1b931a1374c5195e502917f569.jpg
store image:images/3beef10b6c93a36e5602377a1d8da06a.jpg
store image:images/41607ec7987e62e4ae4b6afe09329738.jpg
store image:images/a9df41868eb7746d915eb6f16e64c5ed.jpg
store image:images/a952049b29cd6c02c351e663d9c06f35.jpg
store image:images/1b6594fd84ba7908193a1696bf8b8002.jpg
11|1|Ctech 360 Degrees Rotating Stand Leather Case for Apple iPad 2 and iPad 3 Purple Luxury Crocodile Pattern
12|1|Four-fold PU Leather Case with Sleep/Wake-up for iPad 2 Black
store image:images/c4b22e632971aa62c2eb724839721dcc.jpg
store image:images/167e3289da9cbdec78dbea9caca99ba3.jpg
13|1|Test by Vivek2
14|1|Ctech 360 Degrees Rotating black Leather iPad Case
15|1|Ctech 360 Degrees Rotating Pink Crocodile Leather Case Cover w/ Swivel Stand for iPad 2 or 3
16|1|Ctech 360 Degrees Rotating Case Blue Crocodile Leather Smart Cover with Swivel Stand
store image:images/e176d1573d73cf239ca2a72ba60186ca.jpg
store image:images/c40833752661cfc6be5ddd83a85df896.jpg
store image:images/dc2786c88d9f01c723ade1dacbf3ceaf.jpg
store image:images/eff2b7834f7832255c76d00ffcdf5a8e.jpg

我们得到这样的结果,然后看看 images 文件夹:(注:图片有些没下载到很正常,超时导致的)

PHP多线程批量采集器

很好,图片采集到了,数据呢?看看数据库:

PHP多线程批量采集器

后面的就省略吧,2页20条数据完美采集!动手吧!说不定明天你就有了一个千万级数据的站点(虽然是“海盗”行为)!

附件戳这里:PHP batch Collect Data Tool

 

PHP多线程批量采集器:等您坐沙发呢!

发表评论

您必须 [ 登录 ] 才能发表留言!