酷笔记 > QueryList采集器开发文档 > 4-4 Multi 多线程插件

Multi 多线程插件


Multi 多线程插件

Multi扩展,可以实现多线程采集。

安装:

composer require jaeger/querylist-ext-multi

git地址:

https://github.com/jae-jae/QueryList-Ext-Multi.git

依赖(通过Composer安装的请忽略)

Multi扩展依赖CurlMulti类,Git地址为:https://github.com/jae-jae/CurlMulti

用法一

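<?php
// 注:本示例的开头部分在网页提取时丢失,以下几行为根据上下文还原,请以官方文档为准
require 'vendor/autoload.php';

use QL\QueryList;

//多线程扩展
QueryList::run('Multi', [
    //待采集链接集合
    'list' => [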
        'http://cms.querylist.cc/news/it/547.html',
        'http://cms.querylist.cc/news/it/545.html',
        'http://cms.querylist.cc/news/it/543.html'
        //更多的采集链接....
    ],
    'curl' => [
        'opt' => array(
                    //这里根据自身需求设置curl参数
                    CURLOPT_SSL_VERIFYPEER => false,
                    CURLOPT_SSL_VERIFYHOST => false,
                    CURLOPT_FOLLOWLOCATION => true,
                    CURLOPT_AUTOREFERER => true,
                    //........
                ),
        //设置线程数
        'maxThread' => 100,
        //设置最大尝试数
        'maxTry' => 3
    ],
    'success' => function($a){
        //采集规则
        $reg = array(
            //采集文章标题
            'title' => array('h1','text'),
            //采集文章正文内容,利用过滤功能去掉文章中的超链接,但保留超链接的文字,并去掉版权、JS代码等无用信息
            'content' => array('.post_content','html','a -.content_copyright -script')
            );
        $rang = '.content';
        $ql = QueryList::Query($a['content'],$reg,$rang);
        $data = $ql->getData();
        //打印结果,实际操作中这里应该做入数据库操作
        print_r($data);
    }
]);

用法二

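<?php
// 注:本示例的开头部分在网页提取时丢失,以下几行为根据上下文还原,请以官方文档为准
require 'vendor/autoload.php';

use QL\QueryList;

//多线程扩展
$cm = QueryList::run('Multi', [
    //待采集链接集合
    'list' => [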
        'http://cms.querylist.cc/news/it/547.html',
        'http://cms.querylist.cc/news/it/545.html',
        'http://cms.querylist.cc/news/it/543.html'
        //更多的采集链接....
    ],
    'curl' => [
        'opt' => array(
                    CURLOPT_SSL_VERIFYPEER => false,
                    CURLOPT_SSL_VERIFYHOST => false,
                    CURLOPT_FOLLOWLOCATION => true,
                    CURLOPT_AUTOREFERER => true,
                ),
        //设置线程数
        'maxThread' => 100,
        //设置最大尝试数
        'maxTry' => 3
    ],
    //不自动开始线程,默认自动开始
    'start' => false,
    'success' => function($html,$info){
        //采集操作....
    },
    'error' => function(){
        //出错处理
    }
]);
//再额外添加一些采集链接
$cm->add([
        'http://cms.querylist.cc/news/it/532.html',
        'http://cms.querylist.cc/news/it/528.html',
        'http://cms.querylist.cc/news/other/530.html'
    ],function($html,$info){
        //success
        //可选的,不同的采集操作....
    },
    function(){
        //error
        //可选的,不同的出错处理
    });
//开始采集
$cm->start();

用法三

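<?php
// 注:本示例的开头部分在网页提取时丢失,$url 与 $curl 的定义为根据上下文还原,请以官方文档为准
require 'vendor/autoload.php';

use QL\QueryList;

//待采集的列表页地址(原文地址已丢失,请替换为实际地址)
$url = 'http://example.com/list.html';
//获取 CurlMulti 实例
$curl = QueryList::getInstance('QL\Ext\Lib\CurlMulti');
$curl->maxThread = 100;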
$data = QueryList::run('Request',array(
    'http' =>array(
        'target' => $url,
        'referrer'=>$url,
        'user_agent'=>'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/27.0.1453.93 chrome/27.0.1453.93 Safari/537.36',
        'cookiePath' => './cookie.txt'
    ),
    'callback' => function($html){
        return preg_replace('/.+<\/head>/is','',$html);
    }
))->setQuery(array('title'=>['h2 a','text'],'link'=>['h2 a','href']))->getData(function($item) use($curl){
    // if(!StudyModel::exist($item['title'])){
        $curl->add(['url' => $item['link']],function($a){
            $html = preg_replace('/.+<\/head>/is','',$a['content']);
            $data = QueryList::Query($html,array('title'=>['.entry_title','text'],'content'=>['.post','html','-#headline -script -h3.post_tags -.copyright -.wumii-hook a']))->getData();
            // echo StudyModel::insert($data[0]['title'],$data[0]['content'],$a['info']['url']);
            print_r($data);
        });
    // }
});
$curl->start();