虽然早就知道很多人用 Guzzle 爬数据,但是我却从来没有真正实践过,因为在我的潜意识里,抓取是 Python 的地盘。不过前段时间,当我抓汽车之家数据的时候,好心人跟我提起 Goutte 搭配 Guzzle 是最好的爬虫,让我一直记挂在心上,加上最近打算更新一下车型数据,于是我便重写了抓取汽车之家数据的脚本。
因为我是通过接口抓取,而不是网页,所以暂时用不上 Goutte,只用 Guzzle 就可以了,抓取过程中需要注意两点:首先需要注意的是通过并发节省时间,其次需要注意的是失败重试的步骤。算了,我不想说了,直接贴代码吧。
代码语言:php
<?php
// Composer autoloader; it has to be loaded before any Guzzle class is touched.
require "vendor/autoload.php";

// Guzzle classes referenced by their short names throughout this script.
// Each name must be namespace-qualified, otherwise the alias points at a
// non-existent global class and the first use fails with "class not found".
use GuzzleHttp\Client;
use GuzzleHttp\HandlerStack;
use GuzzleHttp\Middleware;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
// Accumulators filled in by the crawl stages further down.
// Brands
$brands = [];
// Series
$series = [];
// Models
$models = [];
// Configuration items
$configs = [];

// Per-request timeout handed to Guzzle (seconds).
$timeout = 10;
// Upper bound on requests kept in flight by each Pool.
$concurrency = 100;

ini_set("memory_limit", "512M");

// Handler stack with a retry middleware on top of Guzzle's defaults.
$stack = HandlerStack::create();
$stack->push(Middleware::retry(
    // Decider: Guzzle calls this after every attempt, successful or not, and
    // re-sends the request whenever it returns true. It therefore has to look
    // at the outcome; returning "$retries < 3" alone would re-send successful
    // requests as well.
    function ($retries, $request, $response = null, $exception = null) {
        if ($retries >= 3) {
            return false;
        }
        if ($exception !== null) {
            return true;
        }
        return $response !== null && $response->getStatusCode() >= 500;
    },
    // Delay before the next attempt. Guzzle interprets the value as
    // milliseconds, so this yields 1s, 2s, 4s for retries 1, 2, 3.
    function ($retries) {
        return 1000 * pow(2, $retries - 1);
    }
));

$client = new Client([
    // Guzzle's "debug" request option; left enabled as in the original script.
    "debug" => true,
    "timeout" => $timeout,
    "base_uri" => "https://cars.app.autohome.com.cn",
    "headers" => [
        // Tab-separated fields (platform, OS version, app name, app version).
        // NOTE(review): field meanings are inferred from the values - confirm.
        "User-Agent" => "Android\t6.0.1\tautohome\t8.3.0\tAndroid",
    ],
    "handler" => $stack,
]);
// Brand list page: a single request whose "result.brandlist" is a list of
// groups, each carrying a "letter" and a "list" of brands.
$brandListUrl = "/cars_v8.3.0/cars/brands-pm2.json";
$payload = json_decode($client->get($brandListUrl)->getBody()->getContents(), true);

foreach ($payload["result"]["brandlist"] as $group) {
    $letter = $group["letter"];
    foreach ($group["list"] as $brand) {
        $brandId = $brand["id"];
        // Keyed by brand id while collecting, so a repeated id is stored once
        // (the last occurrence wins).
        $brands[$brandId] = [
            "id" => $brandId,
            "name" => $brand["name"],
            "initial" => $letter,
        ];
    }
}

// Re-index from zero; later stages address brands as $brands[$index].
$brands = array_values($brands);
###
// Generator yielding one brand-introduction request per brand, in $brands order.
$requests = function ($brands) {
    foreach ($brands as $v) {
        $id = $v["id"];
        // Brand introduction page
        $url = "/cars_v8.3.0/cars/getbrandinfo-pm2-b{$id}.json";
        yield new Request("GET", $url);
    }
};

// Fetch all descriptions concurrently. $index is the position of the request
// in the generator, which is also the offset of the brand in $brands.
$pool = new Pool($client, $requests($brands), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use (&$brands) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $list = isset($contents["result"]["list"]) ? $contents["result"]["list"] : [];
        // "暂无" is the placeholder stored when the API returns no description.
        $description = isset($list[0]["description"]) ? $list[0]["description"] : "暂无";
        // Normalise Windows line endings to "\n".
        // NOTE(review): the comma pair is a no-op as written; presumably one
        // side was a full-width comma in the original - confirm.
        $description = trim(str_replace(["\r\n", ","], ["\n", ","], $description));
        $brands[$index]["description"] = $description;
    },
    // Without this callback a request that still fails after the retries is
    // dropped silently and the brand ends up with no "description" key.
    "rejected" => function ($reason, $index) use (&$brands) {
        $message = $reason instanceof Exception ? $reason->getMessage() : "unknown error";
        error_log("brand description request #{$index} failed: " . $message);
        $brands[$index]["description"] = "暂无";
    },
]);
$pool->promise()->wait();
// Generator yielding one series-list request per brand, in $brands order.
$requests = function ($brands) {
    foreach ($brands as $v) {
        $id = $v["id"];
        // Series list page
        $url = "/cars_v8.3.0/cars/seriesprice-pm2-b{$id}-t16-v8.3.0.json";
        yield new Request("GET", $url);
    }
};

// Fetch the series of every brand concurrently and collect them, keyed by
// series id, into $series. $index is the offset of the brand in $brands.
$pool = new Pool($client, $requests($brands), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use (&$series, $brands) {
        $contents = $response->getBody()->getContents();
        $contents = json_decode($contents, true);
        $contents = isset($contents["result"]) ? $contents["result"] : [];
        $brand_id = $brands[$index]["id"];
        foreach (["fctlist", "otherfctlist"] as $field) {
            // Skip a factory list that is absent from the response.
            if (!isset($contents[$field]) || !is_array($contents[$field])) {
                continue;
            }
            foreach ($contents[$field] as $factoryEntry) {
                $factory = $factoryEntry["name"];
                foreach ($factoryEntry["serieslist"] as $v) {
                    // The price is split on "-" into lower and upper bound;
                    // "+ [1 => 0]" supplies the upper bound when the string
                    // holds a single value.
                    list($min, $max) = explode("-", $v["price"]) + [1 => 0];
                    // Explicit float casts: a bound with a non-numeric suffix
                    // keeps its leading number and pure text becomes 0, instead
                    // of raising a TypeError in arithmetic on PHP 8.
                    // NOTE(review): the x10000 factor suggests the API reports
                    // prices in units of ten thousand - confirm.
                    $min_price = (float) $min * 10000;
                    $max_price = (float) $max * 10000;
                    if ($max_price == 0) {
                        $max_price = $min_price;
                    }
                    $series[$v["id"]] = [
                        "id" => $v["id"],
                        "name" => $v["name"],
                        "level" => $v["levelname"],
                        "factory" => $factory,
                        "min_price" => $min_price,
                        "max_price" => $max_price,
                        "brand_id" => $brand_id,
                    ];
                }
            }
        }
    },
    // Report requests that still fail after the retries instead of dropping
    // them silently.
    "rejected" => function ($reason, $index) {
        $message = $reason instanceof Exception ? $reason->getMessage() : "unknown error";
        error_log("series list request #{$index} failed: " . $message);
    },
]);
$pool->promise()->wait();

// Re-index from zero; the next stage addresses series as $series[$index].
$series = array_values($series);
###
// Generator yielding one model-list request per series, in $series order.
$requests = function ($series) {
    foreach ($series as $entry) {
        $seriesId = $entry["id"];
        // Model list page
        yield new Request(
            "GET",
            "/carinfo_v8.3.0/cars/seriessummary-pm2-s{$seriesId}-t-c110100-v8.3.0.json"
        );
    }
};

// Fetch the model lists concurrently and collect every model, keyed by its id,
// into $models. $index is the offset of the series in $series.
$pool = new Pool($client, $requests($series), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use (&$models, $series) {
        $payload = json_decode($response->getBody()->getContents(), true);
        $engines = $payload["result"]['enginelist'];
        $ownerSeriesId = $series[$index]["id"];

        foreach ($engines as $engine) {
            // Entries whose "yearvalue" is 0 or 1 are skipped.
            // NOTE(review): the meaning of these two values is not visible
            // here - confirm against the API.
            if (in_array($engine["yearvalue"], [0, 1])) {
                continue;
            }
            foreach ($engine["yearspeclist"] as $yearGroup) {
                foreach ($yearGroup["speclist"] as $spec) {
                    $specId = $spec["id"];
                    // First occurrence wins; a model seen before is left as is.
                    if (isset($models[$specId])) {
                        continue;
                    }
                    $price = $spec["price"] * 10000;
                    // Fall back to the "暂无" placeholder for an empty description.
                    $description = trim($spec["description"]) ?: "暂无";
                    $models[$specId] = [
                        "id" => $specId,
                        "name" => $spec["name"],
                        "status" => $spec["state"],
                        "price" => $price,
                        "description" => $description,
                        "series_id" => $ownerSeriesId,
                    ];
                }
            }
        }
    },
]);
$pool->promise()->wait();

// Re-index from zero; the next stage addresses models as $models[$index].
$models = array_values($models);
###
// Generator yielding one parameter-page request per model, in $models order.
$requests = function ($models) {
    foreach ($models as $v) {
        $id = $v["id"];
        // Model parameter page
        $url = "/cfg_v8.3.0/cars/speccompare.ashx?pm=2&type=1&specids={$id}&cityid=110100&site=2&pl=2";
        yield new Request("GET", $url);
    }
};

// Fetch the parameters of every model concurrently. Each model receives a
// JSON-encoded map "config" (item id => value); every item seen is also
// registered once in $configs[category][id]. $index is the offset of the model
// in $models.
$pool = new Pool($client, $requests($models), [
    "concurrency" => $concurrency,
    "fulfilled" => function ($response, $index) use (&$models, &$configs) {
        $contents = json_decode($response->getBody()->getContents(), true);
        $contents = isset($contents["result"]) ? $contents["result"] : [];
        $modelConfig = [];
        foreach (["paramitems", "configitems"] as $key) {
            // Skip a section that is absent from the response.
            if (!isset($contents[$key]) || !is_array($contents[$key])) {
                continue;
            }
            foreach ($contents[$key] as $group) {
                $category = $group["itemtype"];
                foreach ($group["items"] as $item) {
                    $id = $item["id"];
                    if ($id < 1) {
                        continue;
                    }
                    $name = $item["name"];
                    // "-" marks an item without a value for this model; a
                    // missing value is treated the same way. Strict comparison
                    // so a numeric 0 is not mistaken for "-".
                    $itemValue = isset($item["modelexcessids"][0]["value"])
                        ? $item["modelexcessids"][0]["value"]
                        : "-";
                    if ($itemValue !== "-") {
                        $modelConfig[$id] = $itemValue;
                    }
                    if (!isset($configs[$category][$id])) {
                        $configs[$category][$id] = [
                            "id" => $id,
                            "name" => $name,
                            "category" => $category,
                        ];
                    }
                }
            }
        }
        $models[$index]["config"] = json_encode($modelConfig, JSON_UNESCAPED_UNICODE);
    },
    // A request that still fails after the retries leaves the model with an
    // empty config (the same "[]" an empty map encodes to) and is reported.
    "rejected" => function ($reason, $index) use (&$models) {
        $message = $reason instanceof Exception ? $reason->getMessage() : "unknown error";
        error_log("model config request #{$index} failed: " . $message);
        $models[$index]["config"] = "[]";
    },
]);
$pool->promise()->wait();

// Flatten the per-category maps into one list of config items.
// array_values() drops the category keys first: PHP 8 would otherwise pass
// string keys to array_merge() as named arguments and throw. The empty check
// avoids calling array_merge() with no arguments.
$configs = $configs
    ? call_user_func_array("array_merge", array_values($configs))
    : [];
// todo: 保存数据
?>
编写此类工具性质的脚本无需考虑面向对象之类的弯弯绕,一马平川的流水账往往是最好的选择。运行前记得先通过 composer 安装 guzzle,整个运行过程大概会执行三万次抓取请求,可以抓取汽车之家完整的品牌,车系,车型及配置等相关数据,总耗时大概十分钟左右,效率还是可以接受的。