使用Selenium爬取动态网页如何绕开CloudFlare 5秒盾【示例】

2023-05-20 14:24:49 浏览数 (3)

本文默认你已经安装好 ChromeDriver(chromedriver)。

1. 导入引用

代码语言:xml
 <!-- Guava: presumably the source of ImmutableMap, which the ChromiumDriver class in step 2 uses -->
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>28.0-jre</version>
        </dependency>
        <!-- Selenium Java client: https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-java</artifactId>
            <version>3.141.59</version>
        </dependency>

2. 创建下面三个类(ChromiumDriver、ChromiumDriverCommand、ChromiumDriverCommandExecutor),后面的步骤会用到

代码语言:java
/**
 * A {@link RemoteWebDriver} that talks to a locally started ChromeDriver service and
 * adds Chromium-specific commands: launching a Chrome app and executing raw
 * Chrome DevTools Protocol (CDP) commands.
 */
public class ChromiumDriver extends RemoteWebDriver {

    /**
     * Creates a driver backed by the default {@link ChromeDriverService}, using the
     * {@code "goog"} vendor keyword for the vendor-specific endpoints.
     *
     * @param capabilities the capabilities requested for the new session
     */
    public ChromiumDriver(Capabilities capabilities) {
        this(
                new ChromiumDriverCommandExecutor("goog", ChromeDriverService.createDefaultService()),
                capabilities,
                ChromeOptions.CAPABILITY);
    }

    /**
     * Creates a driver on top of an explicit command executor.
     *
     * @param commandExecutor executor that sends commands to the driver service
     * @param capabilities    the capabilities requested for the new session
     * @param capabilityKey   accepted for signature compatibility; not used by this constructor
     */
    protected ChromiumDriver(CommandExecutor commandExecutor, Capabilities capabilities, String capabilityKey) {
        super(commandExecutor, capabilities);
    }

    /**
     * Launches the Chrome app identified by {@code id}.
     *
     * @param id Chrome app id.
     */
    public void launchApp(String id) {
        Map<String, ?> payload = ImmutableMap.of("id", id);
        execute(ChromiumDriverCommand.LAUNCH_APP, payload);
    }

    /**
     * Executes a Chrome DevTools Protocol command and returns its result. The command
     * name and arguments should follow the
     * <a href="https://chromedevtools.github.io/devtools-protocol/">Chrome DevTools
     * Protocol domains/commands</a>.
     *
     * @param commandName CDP command name, sent under the {@code "cmd"} key
     * @param parameters  CDP command arguments, sent under the {@code "params"} key
     * @return an immutable copy of the map returned by the driver
     */
    public Map<String, Object> executeCdpCommand(String commandName, Map<String, Object> parameters) {
        Map<String, ?> request = ImmutableMap.of("cmd", commandName, "params", parameters);
        Object rawResult = getExecuteMethod().execute(ChromiumDriverCommand.EXECUTE_CDP_COMMAND, request);

        @SuppressWarnings("unchecked")
        Map<String, Object> result = (Map<String, Object>) rawResult;

        return ImmutableMap.copyOf(result);
    }

    /** Ends the session by delegating to {@link RemoteWebDriver#quit()}. */
    @Override
    public void quit() {
        super.quit();
    }
}
代码语言:java
/**
 * Names of the Chromium-specific driver commands. Constants holder only; it is
 * never instantiated.
 */
final class ChromiumDriverCommand {

    // --- App launch ---
    static final String LAUNCH_APP = "launchApp";

    // --- Network condition emulation ---
    static final String GET_NETWORK_CONDITIONS = "getNetworkConditions";
    static final String SET_NETWORK_CONDITIONS = "setNetworkConditions";
    static final String DELETE_NETWORK_CONDITIONS = "deleteNetworkConditions";

    // --- Chrome DevTools Protocol passthrough ---
    static final String EXECUTE_CDP_COMMAND = "executeCdpCommand";

    // --- Cast / Media Router APIs ---
    static final String GET_CAST_SINKS = "getCastSinks";
    static final String SET_CAST_SINK_TO_USE = "selectCastSink";
    static final String START_CAST_TAB_MIRRORING = "startCastTabMirroring";
    static final String GET_CAST_ISSUE_MESSAGE = "getCastIssueMessage";
    static final String STOP_CASTING = "stopCasting";

    // --- Permissions ---
    static final String SET_PERMISSION = "setPermission";

    /** Not instantiable: this class only groups constants. */
    private ChromiumDriverCommand() {
    }
}
代码语言:java
public class ChromiumDriverCommandExecutor extends DriverCommandExecutor {

    private static Map<String, CommandInfo> buildChromiumCommandMappings(String vendorKeyword) {
        String sessionPrefix = "/session/:sessionId/";
        String chromiumPrefix = sessionPrefix   "chromium";
        String vendorPrefix = sessionPrefix   vendorKeyword;

        HashMap<String, CommandInfo> mappings = new HashMap<>();

        mappings.put(ChromiumDriverCommand.LAUNCH_APP,
                new CommandInfo(chromiumPrefix   "/launch_app", HttpMethod.POST));

        String networkConditions = chromiumPrefix   "/network_conditions";
        mappings.put(ChromiumDriverCommand.GET_NETWORK_CONDITIONS,
                new CommandInfo(networkConditions, HttpMethod.GET));
        mappings.put(ChromiumDriverCommand.SET_NETWORK_CONDITIONS,
                new CommandInfo(networkConditions, HttpMethod.POST));
        mappings.put(ChromiumDriverCommand.DELETE_NETWORK_CONDITIONS,
                new CommandInfo(networkConditions, HttpMethod.DELETE));

        mappings.put( ChromiumDriverCommand.EXECUTE_CDP_COMMAND,
                new CommandInfo(vendorPrefix   "/cdp/execute", HttpMethod.POST));

        // Cast / Media Router APIs
        String cast = vendorPrefix   "/cast";
        mappings.put(ChromiumDriverCommand.GET_CAST_SINKS,
                new CommandInfo(cast   "/get_sinks", HttpMethod.GET));
        mappings.put(ChromiumDriverCommand.SET_CAST_SINK_TO_USE,
                new CommandInfo(cast   "/set_sink_to_use", HttpMethod.POST));
        mappings.put(ChromiumDriverCommand.START_CAST_TAB_MIRRORING,
                new CommandInfo(cast   "/start_tab_mirroring", HttpMethod.POST));
        mappings.put(ChromiumDriverCommand.GET_CAST_ISSUE_MESSAGE,
                new CommandInfo(cast   "/get_issue_message", HttpMethod.GET));
        mappings.put(ChromiumDriverCommand.STOP_CASTING,
                new CommandInfo(cast   "/stop_casting", HttpMethod.POST));

        mappings.put(ChromiumDriverCommand.SET_PERMISSION,
                new CommandInfo(sessionPrefix   "/permissions", HttpMethod.POST));

        return unmodifiableMap(mappings);
    }

    public ChromiumDriverCommandExecutor(String vendorPrefix, DriverService service) {
        super(service, buildChromiumCommandMappings(vendorPrefix));
    }
}

3. 开发工具类,获取ChromiumDriver

代码语言:java
/**
 * Starts a headless Chrome session configured to look less like an automated
 * browser and returns the driver for it.
 *
 * <p>The session excludes the {@code enable-automation} switch, sets a desktop
 * user agent, and registers a script (via the CDP command
 * {@code Page.addScriptToEvaluateOnNewDocument}) that makes
 * {@code navigator.webdriver} read as {@code undefined}.
 *
 * <p>The caller owns the returned driver and must call {@code quit()} on it.
 * If the CDP setup fails, the freshly started driver is quit before the
 * exception is rethrown, so no browser process is left behind.
 *
 * @return a started, configured {@code ChromiumDriver}
 */
public static ChromiumDriver getChromiumDriver() {
        // Path to the chromedriver executable; it lets Selenium start the local Chrome browser.
        // Replace the placeholder below with the real path.
        String driverFilePath = "你的chromedriver路径";
        if (!StringUtils.isEmpty(driverFilePath)){
            System.setProperty("webdriver.chrome.driver", driverFilePath);
        }


        // Initial Chrome configuration starts here.
        HashMap<String, Object> prefs = new HashMap<String, Object>();
        ChromeOptions options = new ChromeOptions();
        options.setExperimentalOption("prefs", prefs);
        // Exclude the switch that marks the browser as controlled by automation.
        String[] excludedSwitches = { "enable-automation" };
        options.setExperimentalOption("excludeSwitches", excludedSwitches);
        options.addArguments("--headless");
        options.addArguments("window-size=1920,1080");
        String ua="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36";
        options.addArguments(String.format("--user-agent=%s", ua));

        options.addArguments("--no-sandbox");
        options.addArguments("--disable-gpu");
        options.addArguments("--disable-dev-shm-usage");
        options.setBinary("/usr/bin/google-chrome");

        DesiredCapabilities chromeCaps = DesiredCapabilities.chrome();
        chromeCaps.setCapability(ChromeOptions.CAPABILITY, options);


        // Run a CDP command so that navigator.webdriver evaluates to undefined on every new document.
        ChromiumDriver driver = new ChromiumDriver(chromeCaps);
        try {
            HashMap<String, Object> cdpCmd = new HashMap<String, Object>();
            cdpCmd.put("source", "Object.defineProperty(navigator, 'webdriver', {get: () => undefined }); ");
            driver.executeCdpCommand("Page.addScriptToEvaluateOnNewDocument", cdpCmd);
        } catch (RuntimeException e) {
            // The session is already running; shut it down so the browser/driver
            // processes do not leak when setup fails, then surface the original error.
            try {
                driver.quit();
            } catch (RuntimeException quitFailure) {
                e.addSuppressed(quitFailure);
            }
            throw e;
        }

        return driver;
    }

4. 开发工具类,获取网页

代码语言:java
   /**
    * Loads {@code url} in a fresh headless Chrome session, waits six seconds, and
    * returns the resulting page source. The session is always quit afterwards.
    *
    * @param url the page to load
    * @return the page source after the wait, or {@code null} if loading failed or
    *         the waiting thread was interrupted (the interrupt flag is restored)
    */
   public static String convertHtml(String url) {
        ChromiumDriver chromiumDriver = getChromiumDriver();
        try {
            chromiumDriver.get(url);
            // Fixed wait before reading the page; presumably chosen to outlast the
            // roughly 5-second Cloudflare challenge page this article targets.
            Thread.sleep(6000);
            return chromiumDriver.getPageSource();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers and thread pools can still see
            // that this thread was asked to stop.
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            chromiumDriver.quit();
        }
    }

发表时间:2023-05-19

本站文章除注明转载/出处外,皆为作者原创,欢迎转载,但未经作者同意必须保留此段声明,且在文章页面明显位置给出原文连接,否则保留追究法律责任的权利。

0 人点赞