1、安装EasySwoole 3.1.18版本
[root@ar414.com phpseleniumdemo]# composer require easyswoole/easyswoole=3.1.18
[root@ar414.com phpseleniumdemo]# php vendor/easyswoole/easyswoole/bin/easyswoole install
______ _____ _
| ____| / ____| | |
| |__ __ _ ___ _ _ | (___ __ __ ___ ___ | | ___
| __| / _` | / __| | | | | \___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
| |____ | (_| | \__ \ | |_| | ____) | \ V V / | (_) | | (_) | | | | __/
|______| \__,_| |___/ \__, | |_____/ \_/\_/ \___/ \___/ |_| \___|
__/ |
|___/
install success,enjoy!
复制代码
2、安装核心库 facebook/webdriver、easyswoole/curl
[root@ar414.com phpseleniumdemo]# composer require facebook/webdriver=1.7
[root@ar414.com phpseleniumdemo]# composer require easyswoole/curl=1.0.1
3、确认运行没报错
[root@ar414.com phpseleniumdemo]# php easyswoole start
| ____| / ____| | |
| |__ __ _ ___ _ _ | (___ __ __ ___ ___ | | ___
| __| / _` | / __| | | | | \___ \ \ \ /\ / / / _ \ / _ \ | | / _ \
| |____ | (_| | \__ \ | |_| | ____) | \ V V / | (_) | | (_) | | | | __/
|______| \__,_| |___/ \__, | |_____/ \_/\_/ \___/ \___/ |_| \___|
__/ |
|___/
main server SWOOLE_WEB
listen address 0.0.0.0
listen port 9501
sub server1 CONSOLE => SWOOLE_TCP@127.0.0.1:9500
....
Tips:代理资源请自行解决,这里只提供例子,实际是用不了的
1、 创建项目主目录
[root@ar414.com phpseleniumdemo]# mkdir App
# composer 指定 App 作用域
[root@ar414.com phpseleniumdemo]# cat composer.json
{
    "autoload": {
        "psr-4": {
            "App\\": "App/"
        }
    },
    "require": {
        "easyswoole/easyswoole": "3.1.18",
        "facebook/webdriver": "^1.7",
        "easyswoole/curl": "1.0.1"
    }
}
# 更新 composer autoload
[root@ar414.com phpseleniumdemo]# composer dump-autoload
2、创建进程目录(将代理池更新作为一个子进程随项目启动运行)
[root@ar414.com phpseleniumdemo]# mkdir App/Process 复制代码
3、代理池定时爬取(使用Redis List类型保证最新代理IP在头部,爬虫逻辑每次从头部获取,一个代理IP只用一次)
Tips:代理资源请自行解决,这里只提供例子,实际是用不了的
<?php
/**
 * Created by PhpStorm.
 * User: ar414.com@gmail.com
 * Date: 2019/12/7
 * Time: 21:00
 */

namespace App\Process;

use App\Lib\Curl;
use App\Lib\Kv;
use EasySwoole\Component\Process\AbstractProcess;

/**
 * Child process that polls the proxy provider API on a fixed interval and
 * pushes every returned "ip:port" entry onto the head of a Redis list.
 */
class UpdateProxyPool extends AbstractProcess
{
    // URL template; the two %s placeholders receive the API id and the access key.
    // The proxies returned by this endpoint only support the socks5 protocol.
    private $proxyListApi = "http://www.zdopen.com/ShortS5Proxy/GetIP/?api=%s&akey=%s&order=2&type=3";

    // Redis list key holding the proxies (newest at the head).
    const PROXY_KV_KEY = 'spider:proxy:list';

    // Seconds to sleep between two polls of the provider API.
    const TIMER = 15;

    /**
     * Fill the API id and access key into the URL template.
     *
     * PROXY_LIST_API / PROXY_LIST_KEY from the environment take precedence;
     * when they are absent the original sample values are used, so existing
     * setups behave exactly as before. The sample credentials are
     * placeholders and do not return usable proxies.
     */
    protected function initProxyListApi()
    {
        $apiId = $_ENV['PROXY_LIST_API'] ?? 20191231231237085;
        $accessKey = $_ENV['PROXY_LIST_KEY'] ?? '72axxxae0fe34';

        $this->proxyListApi = sprintf($this->proxyListApi, $apiId, $accessKey);
    }

    /**
     * Process entry point: poll the provider forever, storing each batch.
     *
     * Requires: composer require easyswoole/curl=1.0.1
     *
     * @param mixed $arg Argument handed over when the process was created (unused).
     */
    public function run($arg)
    {
        $this->initProxyListApi();

        while (true) {
            $ret = Curl::get($this->proxyListApi);
            var_dump($ret);

            $this->storeProxies($ret);

            sleep(self::TIMER);
        }
    }

    /**
     * Decode one API response and push each well-formed proxy onto the Redis list.
     *
     * Empty bodies, invalid JSON, non-success codes and entries without an
     * ip/port pair are skipped instead of raising notices or storing ":".
     *
     * @param mixed $response Raw response body as returned by Curl::get().
     */
    private function storeProxies($response)
    {
        if (!$response) {
            return;
        }

        $payload = json_decode($response, true);
        if (!is_array($payload) || !isset($payload['code'], $payload['data']['proxy_list'])) {
            return;
        }

        $proxyList = $payload['data']['proxy_list'];
        // Loose comparison kept on purpose: the code may arrive as int or numeric string.
        if ($payload['code'] != 10001 || empty($proxyList) || !is_array($proxyList)) {
            return;
        }

        foreach ($proxyList as $proxy) {
            if (!isset($proxy['ip'], $proxy['port'])) {
                continue;
            }
            Kv::redis()->lPush(self::PROXY_KV_KEY, $proxy['ip'] . ':' . $proxy['port']);
        }
    }
}
4、配置代理池更新进程随项目启动时启动(完整代码链接)
/**
 * Hook invoked while the main server is being created; attaches the
 * proxy-pool updater as a child process of the Swoole server.
 *
 * @param EventRegister $register Event register of the main server (unused here).
 */
public static function mainServerCreate(EventRegister $register)
{
    $swooleServer = ServerManager::getInstance()->getSwooleServer();

    // Proxy-pool update process
    $proxyPoolUpdater = new \App\Process\UpdateProxyPool('UpdateProxyPool', []);
    $swooleServer->addProcess($proxyPoolUpdater->getProcess());
}
爬取列表页进程(完整代码链接)
<?php
/**
 * Created by PhpStorm.
 * User: ar414.com@gmail.com
 * Date: 2019/12/7
 * Time: 22:01
 */

namespace App\Process;

use App\Lib\ChromeDriver;
use App\Lib\Kv;
use EasySwoole\Component\Process\AbstractProcess;
use EasySwoole\EasySwoole\Logger;

/**
 * Child process that loads the list endpoint in a browser on a fixed interval,
 * extracts every "PD=...;" token from the page source and stores the
 * de-duplicated set as JSON in Redis.
 */
class ListSpider extends AbstractProcess
{
    const API = 'https://www.188-sb.com/SportsBook.API/web?lid=1&zid=3&pd=%23AC%23B151%23C1%23D50%23E10%23F163%23&cid=42&ctid=42';

    // Redis key holding the JSON-encoded list of extracted item paths.
    const LIST_KV_KEY = 'spider:list';

    // Seconds to sleep between two crawls.
    const TIMER = 20;

    /**
     * Process entry point: crawl forever, one browser session per iteration.
     *
     * @param mixed $arg Argument handed over when the process was created (unused).
     */
    public function run($arg)
    {
        while (true) {
            // Stays null when the driver cannot be created, so cleanup can tell
            // "nothing to shut down" apart from a live session.
            $driver = null;

            try {
                $driver = (new ChromeDriver(true))->getDriver();
                $driver->get(self::API);

                $listStr = $driver->getPageSource();
                var_dump($listStr);
                // Debug snapshot of the last fetched page (path is machine-specific).
                file_put_contents("/www/wwwroot/blog/phpseleniumdemo/listStr.html", $listStr);

                preg_match_all("/PD=(.*);/U", $listStr, $list);
                // array_unique() keeps the original keys; re-index so the value
                // is always encoded as a JSON array, never as an object.
                $list = array_values(array_unique($list[1]));
                if ($list) {
                    Kv::redis()->set(self::LIST_KV_KEY, json_encode($list));
                }
                var_dump('done');
            } catch (\Throwable $throwable) {
                Logger::getInstance()->log($throwable->getMessage(), 'ListSpiderError');
                var_dump($throwable->getMessage());
            } finally {
                // Single cleanup path for both success and failure.
                $this->shutdownDriver($driver);
            }

            sleep(self::TIMER);
        }
    }

    /**
     * End the browser session, if there is one, without letting a cleanup
     * failure escape and terminate the crawl loop.
     *
     * @param mixed $driver Driver returned by ChromeDriver::getDriver(), or null.
     */
    private function shutdownDriver($driver)
    {
        if ($driver === null) {
            return;
        }

        try {
            // quit() closes every window and ends the session, so a separate
            // close() call beforehand is not needed.
            $driver->quit();
        } catch (\Throwable $quitError) {
            Logger::getInstance()->log($quitError->getMessage(), 'ListSpiderError');
        }
    }
}
1、完整代码链接
/**
 * Hook invoked while the main server is being created.
 *
 * Attaches the proxy-pool updater and the list spider as child processes and,
 * on worker 0 only, starts a 30-second timer that reads the stored list from
 * Redis and dispatches one asynchronous ItemSpider task per entry.
 *
 * @param EventRegister $register Event register of the main server.
 */
public static function mainServerCreate(EventRegister $register)
{
    // Proxy-pool update process
    ServerManager::getInstance()->getSwooleServer()->addProcess(
        (new \App\Process\UpdateProxyPool('UpdateProxyPool', []))->getProcess()
    );

    // List crawling process
    ServerManager::getInstance()->getSwooleServer()->addProcess(
        (new \App\Process\ListSpider('ListSpider', []))->getProcess()
    );

    $register->set($register::onWorkerStart, function (\swoole_server $server, $workerId) {
        // Only the first worker owns the timer, so each tick is dispatched once.
        if ($workerId != 0) {
            return;
        }

        Timer::getInstance()->loop(30000, function () {
            $ret = Kv::redis()->get(ListSpider::LIST_KV_KEY);
            if (!$ret) {
                return;
            }

            $items = json_decode($ret, true);
            // Corrupt or non-list content in Redis must not reach foreach.
            if (!is_array($items)) {
                return;
            }

            foreach ($items as $item) {
                TaskManager::async(function () use ($item) {
                    (new ItemSpider(true))->run($item);
                    return true;
                }, function () use ($item) {
                    var_dump("{$item} Done");
                });
            }
        });
    });
}
2、ItemSpider逻辑代码(完整代码链接)
<?php
/**
 * Created by PhpStorm.
 * User: ar414.com@gmail.com
 * Date: 2019/12/7
 * Time: 22:35
 */

namespace App\Spider;

use App\Lib\ChromeDriver;
use EasySwoole\EasySwoole\Logger;
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;

/**
 * Loads a single item page in a browser and dumps its rendered source.
 */
class ItemSpider
{
    /**
     * Open the item page, wait until the market buttons are visible and dump
     * the page source. Failures are logged under 'Bet365ApiRun' and never
     * thrown to the caller; the browser session is always shut down.
     *
     * @param string $itemPath Item path as stored by the list spider; every
     *                         '#' in it is replaced with '/' before it is
     *                         appended to the URL fragment.
     */
    public function run($itemPath)
    {
        $itemPath = str_replace('#', '/', $itemPath);
        $url = "https://www.188-sb.com/#{$itemPath}";
        var_dump($url);

        // Stays null when the driver cannot be created, so cleanup can be skipped.
        $driver = null;

        try {
            $driver = (new ChromeDriver(true))->getDriver();
            $driver->get($url);

            // Block until the market group buttons are visible.
            $driver->wait(ChromeDriver::WAIT_SECONDS)->until(
                WebDriverExpectedCondition::visibilityOfElementLocated(
                    WebDriverBy::className('gl-MarketGroupButton_Text')
                )
            );

            Logger::getInstance()->console("The title is '" . $driver->getTitle() . "'\n");
            Logger::getInstance()->console("The current URI is '" . $driver->getCurrentURL() . "'\n");

            $body = $driver->getPageSource();
            var_dump($body);
            //TODO clean the data and persist it
        } catch (\Throwable $throwable) {
            Logger::getInstance()->log($throwable->getMessage(), 'Bet365ApiRun');
        } finally {
            if ($driver !== null) {
                try {
                    // quit() closes every window and ends the session.
                    $driver->quit();
                } catch (\Throwable $quitError) {
                    // A failed shutdown must not mask the outcome of the run.
                    Logger::getInstance()->log($quitError->getMessage(), 'Bet365ApiRun');
                }
            }
        }
    }
}
3、运行
[root@ar414.com phpseleniumdemo]# php easyswoole start
如果觉得我的文章对您有用,请随意打赏。你的支持将鼓励我继续创作!