JAVA的Selenium自动化爬取TK数据收集-----JAVA



    4.0.0
    
        org.springframework.boot
        spring-boot-starter-parent
        3.4.3
         
    
    com.alatus
    TiktokCrawl
    0.0.1-SNAPSHOT
    TiktokCrawl
    TiktokCrawl
    
    
        
    
    
        
    
    
        
        
        
        
    
    
        17
    
    
        
            org.springframework.boot
            spring-boot-starter-web
        
        
            org.seleniumhq.selenium
            selenium-java
            4.15.0
        
        
            io.github.bonigarcia
            webdrivermanager
            5.6.3
        
        
            org.projectlombok
            lombok
            true
        
    

    
    



    4.0.0
    
        org.springframework.boot
        spring-boot-starter-parent
        3.4.3
         
    
    com.alatus
    TiktokCrawl
    0.0.1-SNAPSHOT
    TiktokCrawl
    TiktokCrawl
    
    
        
    
    
        
    
    
        
        
        
        
    
    
        17
    
    
        
            org.springframework.boot
            spring-boot-starter-web
        
        
            org.seleniumhq.selenium
            selenium-java
            4.15.0
        
        
            io.github.bonigarcia
            webdrivermanager
            5.6.3
        
        
            org.projectlombok
            lombok
            true
        
    

    
    

package com.alatus.tiktokcrawl;

import com.alatus.tiktokcrawl.Crawl.TikTokCrawl;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Application entry point.
 *
 * <p>Runs one TikTok crawl synchronously and only afterwards boots the
 * Spring application.
 */
@SpringBootApplication
public class TiktokCrawlApplication {

    /**
     * Performs the crawl first, then hands control to Spring Boot.
     *
     * @param args command-line arguments forwarded unchanged to Spring Boot
     */
    public static void main(String[] args) {
        // The crawl blocks until the browser session has finished.
        TikTokCrawl crawler = new TikTokCrawl();
        crawler.crawl();

        SpringApplication.run(TiktokCrawlApplication.class, args);
    }

}

 

package com.alatus.tiktokcrawl;

import com.alatus.tiktokcrawl.Crawl.TikTokCrawl;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Application entry point.
 *
 * <p>Runs one TikTok crawl synchronously and only afterwards boots the
 * Spring application.
 */
@SpringBootApplication
public class TiktokCrawlApplication {

    /**
     * Performs the crawl first, then hands control to Spring Boot.
     *
     * @param args command-line arguments forwarded unchanged to Spring Boot
     */
    public static void main(String[] args) {
        // The crawl blocks until the browser session has finished.
        TikTokCrawl crawler = new TikTokCrawl();
        crawler.crawl();

        SpringApplication.run(TiktokCrawlApplication.class, args);
    }

}
package com.alatus.tiktokcrawl.Crawl;

import lombok.Data;
import org.openqa.selenium.*;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.interactions.Actions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.time.Duration;
import java.util.Arrays;
import java.util.List;

/**
 * Drives Microsoft Edge through Selenium to open a TikTok profile page and
 * print the video links and ".video-count" texts found in its post list.
 */
@Data
public class TikTokCrawl {

    /**
     * Opens the profile page, applies the configured cookies, scrolls until the
     * page height stops growing and prints what it finds to standard output.
     *
     * <p>The browser is always closed when the method returns. If the thread is
     * interrupted while waiting, the crawl stops early and the interrupt flag is
     * restored for the caller. Unchecked Selenium exceptions (for example a wait
     * timeout) propagate after the browser has been closed.
     */
    public void crawl() {
        // Tell Selenium where the EdgeDriver binary lives.
        System.setProperty("webdriver.edge.driver", "c:/msedgedriver.exe");

        WebDriver driver = new EdgeDriver();

        try {
            // The target page must be loaded before cookies can be set for its domain.
            driver.get("https://www.tiktok.com/@sdasdxzc");
            Thread.sleep(5000);

            // Cookie header in "name=value; name2=value2" form; empty means no cookies are added.
            String cookiesString = "";
            applyCookies(driver, cookiesString);

            // Reload so the page is requested again with the cookies in place.
            driver.navigate().refresh();
            Thread.sleep(5000);

            scrollToBottom((JavascriptExecutor) driver);

            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
            WebElement postList = wait.until(ExpectedConditions.presenceOfElementLocated(
                    By.cssSelector("div[data-e2e='user-post-item-list']")
            ));
            printPosts(postList);

            // Simulate a mouse movement.
            new Actions(driver).moveByOffset(100, 100).perform();
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            driver.quit();
        }
    }

    /**
     * Adds every "name=value" pair of a cookie header string to the browser session.
     *
     * <p>Only the first '=' separates name from value, so values that themselves
     * contain '=' are kept intact. Entries without a name or without '=' are skipped.
     *
     * @param driver       session that receives the cookies
     * @param cookieHeader pairs separated by ';' (optional surrounding whitespace)
     */
    private static void applyCookies(WebDriver driver, String cookieHeader) {
        for (String rawPair : cookieHeader.split(";")) {
            String pair = rawPair.trim();
            int separator = pair.indexOf('=');
            if (separator <= 0) {
                continue;
            }
            String name = pair.substring(0, separator);
            String value = pair.substring(separator + 1);
            driver.manage().addCookie(new Cookie(name, value));
        }
    }

    /**
     * Scrolls to the bottom repeatedly until the document height no longer changes.
     *
     * <p>NOTE(review): there is no upper bound on the number of scrolls; a page
     * that keeps growing keeps this loop running.
     *
     * @param js script executor bound to the open page
     * @throws InterruptedException if interrupted while waiting for content to load
     */
    private static void scrollToBottom(JavascriptExecutor js) throws InterruptedException {
        long lastHeight = readScrollHeight(js);
        while (true) {
            js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
            Thread.sleep(2000); // give the page time to load more content

            long newHeight = readScrollHeight(js);
            if (newHeight == lastHeight) {
                return;
            }
            lastHeight = newHeight;
        }
    }

    /** Reads document.body.scrollHeight, accepting whichever numeric type the driver returns. */
    private static long readScrollHeight(JavascriptExecutor js) {
        Object height = js.executeScript("return document.body.scrollHeight");
        return ((Number) height).longValue();
    }

    /**
     * Prints the href of every anchor and the text of every ".video-count"
     * element found under each direct child of the post-list container.
     *
     * @param postList the post-list container element
     */
    private static void printPosts(WebElement postList) {
        List<WebElement> children = postList.findElements(By.xpath("./*"));

        for (WebElement child : children) {
            List<WebElement> links = child.findElements(By.tagName("a"));
            for (WebElement link : links) {
                String videoLink = link.getAttribute("href");
                if (videoLink != null && !videoLink.isEmpty()) {
                    System.out.println("Video Link: " + videoLink);
                }
            }

            List<WebElement> videoCountElements = child.findElements(By.cssSelector(".video-count"));
            for (WebElement videoCountElement : videoCountElements) {
                System.out.println("Video Count: " + videoCountElement.getText());
            }
        }
    }
}

 

package com.alatus.tiktokcrawl.Crawl;

import lombok.Data;
import org.openqa.selenium.*;
import org.openqa.selenium.edge.EdgeDriver;
import org.openqa.selenium.interactions.Actions;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
import java.time.Duration;
import java.util.Arrays;
import java.util.List;

/**
 * Drives Microsoft Edge through Selenium to open a TikTok profile page and
 * print the video links and ".video-count" texts found in its post list.
 */
@Data
public class TikTokCrawl {

    /**
     * Opens the profile page, applies the configured cookies, scrolls until the
     * page height stops growing and prints what it finds to standard output.
     *
     * <p>The browser is always closed when the method returns. If the thread is
     * interrupted while waiting, the crawl stops early and the interrupt flag is
     * restored for the caller. Unchecked Selenium exceptions (for example a wait
     * timeout) propagate after the browser has been closed.
     */
    public void crawl() {
        // Tell Selenium where the EdgeDriver binary lives.
        System.setProperty("webdriver.edge.driver", "c:/msedgedriver.exe");

        WebDriver driver = new EdgeDriver();

        try {
            // The target page must be loaded before cookies can be set for its domain.
            driver.get("https://www.tiktok.com/@sdasdxzc");
            Thread.sleep(5000);

            // Cookie header in "name=value; name2=value2" form; empty means no cookies are added.
            String cookiesString = "";
            applyCookies(driver, cookiesString);

            // Reload so the page is requested again with the cookies in place.
            driver.navigate().refresh();
            Thread.sleep(5000);

            scrollToBottom((JavascriptExecutor) driver);

            WebDriverWait wait = new WebDriverWait(driver, Duration.ofSeconds(10));
            WebElement postList = wait.until(ExpectedConditions.presenceOfElementLocated(
                    By.cssSelector("div[data-e2e='user-post-item-list']")
            ));
            printPosts(postList);

            // Simulate a mouse movement.
            new Actions(driver).moveByOffset(100, 100).perform();
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            driver.quit();
        }
    }

    /**
     * Adds every "name=value" pair of a cookie header string to the browser session.
     *
     * <p>Only the first '=' separates name from value, so values that themselves
     * contain '=' are kept intact. Entries without a name or without '=' are skipped.
     *
     * @param driver       session that receives the cookies
     * @param cookieHeader pairs separated by ';' (optional surrounding whitespace)
     */
    private static void applyCookies(WebDriver driver, String cookieHeader) {
        for (String rawPair : cookieHeader.split(";")) {
            String pair = rawPair.trim();
            int separator = pair.indexOf('=');
            if (separator <= 0) {
                continue;
            }
            String name = pair.substring(0, separator);
            String value = pair.substring(separator + 1);
            driver.manage().addCookie(new Cookie(name, value));
        }
    }

    /**
     * Scrolls to the bottom repeatedly until the document height no longer changes.
     *
     * <p>NOTE(review): there is no upper bound on the number of scrolls; a page
     * that keeps growing keeps this loop running.
     *
     * @param js script executor bound to the open page
     * @throws InterruptedException if interrupted while waiting for content to load
     */
    private static void scrollToBottom(JavascriptExecutor js) throws InterruptedException {
        long lastHeight = readScrollHeight(js);
        while (true) {
            js.executeScript("window.scrollTo(0, document.body.scrollHeight);");
            Thread.sleep(2000); // give the page time to load more content

            long newHeight = readScrollHeight(js);
            if (newHeight == lastHeight) {
                return;
            }
            lastHeight = newHeight;
        }
    }

    /** Reads document.body.scrollHeight, accepting whichever numeric type the driver returns. */
    private static long readScrollHeight(JavascriptExecutor js) {
        Object height = js.executeScript("return document.body.scrollHeight");
        return ((Number) height).longValue();
    }

    /**
     * Prints the href of every anchor and the text of every ".video-count"
     * element found under each direct child of the post-list container.
     *
     * @param postList the post-list container element
     */
    private static void printPosts(WebElement postList) {
        List<WebElement> children = postList.findElements(By.xpath("./*"));

        for (WebElement child : children) {
            List<WebElement> links = child.findElements(By.tagName("a"));
            for (WebElement link : links) {
                String videoLink = link.getAttribute("href");
                if (videoLink != null && !videoLink.isEmpty()) {
                    System.out.println("Video Link: " + videoLink);
                }
            }

            List<WebElement> videoCountElements = child.findElements(By.cssSelector(".video-count"));
            for (WebElement videoCountElement : videoCountElements) {
                System.out.println("Video Count: " + videoCountElement.getText());
            }
        }
    }
}

你可能感兴趣的:(软件架构设计,JAVA,#,Spring-Boot框架,spring,cloud,后端,spring,boot,jvm,分布式,selenium,爬虫)