Python爬虫JSON网址selenium实战笔记

仅供学习参考

一、获取特定文本和json链接

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape the name and JSON link of every entry in the list on a dynamic page
# and print one "姓名: <name>, 链接: <link>" line per entry.
from selenium.common.exceptions import TimeoutException

# Path to the ChromeDriver binary
chrome_driver_path = r'F:\chrome-win64\chromedriver.exe'

# Start a Chrome session.
# NOTE: `executable_path` was removed in Selenium 4.10; on those versions pass
# `service=Service(chrome_driver_path)` instead
# (from selenium.webdriver.chrome.service import Service).
driver = webdriver.Chrome(executable_path=chrome_driver_path)

try:
    # Open the target page
    url = "动态网页网址链接"
    driver.get(url)

    # Explicit wait only: poll for up to 10 seconds per element. No implicit
    # wait is set because Selenium advises against mixing implicit and
    # explicit waits (the combined timeout becomes unpredictable).
    wait = WebDriverWait(driver, 10)

    for i in range(1, 100):
        name_xpath = f"/html/body/div[5]/ul[1]/li[{i}]/p/a"

        try:
            # Wait until the i-th name link is visible
            name_element = wait.until(
                EC.visibility_of_element_located((By.XPATH, name_xpath))
            )
        except TimeoutException:
            # No visible i-th item within the timeout: treat it as the end of
            # the list instead of crashing with an unhandled exception.
            break

        name = name_element.text
        link = name_element.get_attribute("href")
        print(f"姓名: {name}, 链接: {link}")
finally:
    # Always close the browser, even if navigation or scraping failed
    driver.quit()

结果:将上面的输出保存为 网址.txt 后的文本内容

姓名: abc, 链接: http://abc.json

二、读取上一步得到的TXT文本(网址.txt),保留姓名,把从各链接页面中提取到的邮箱地址放在姓名之后,输出为新的文本(个人网址.txt)

import os
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Path to the ChromeDriver binary (raw string so the backslashes stay literal)
chrome_driver_path = r'F:\chrome-win64\chromedriver.exe'

# Input file with one "姓名: <name>, 链接: <url>" record per line, and the
# output file that receives one "<name>: <email>" line per visited link.
input_file_path = '网址.txt'
output_file_path = '个人网址.txt'

# One record: name, optional comma, then the link. Both half-width and
# full-width punctuation are accepted. The name is captured without the
# separating comma, and the URL ends at the first whitespace, so the last
# record does not need a trailing newline.
ENTRY_PATTERN = re.compile(r'姓名[::]\s*(.*?)\s*[,,]?\s*链接[::]\s*(\S+)')

# Generic e-mail pattern used when the page has no dedicated e-mail element
EMAIL_PATTERN = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

# Placeholder written when no e-mail address can be found on a page
EMAIL_NOT_FOUND = "无法提取邮箱地址"


def parse_entries(content):
    """Return the ``(name, url)`` pairs found in *content*, in order.

    Args:
        content: Text holding "姓名: <name>, 链接: <url>" records.

    Returns:
        A list of ``(name, url)`` tuples; empty when nothing matches.
    """
    return ENTRY_PATTERN.findall(content)


def first_email(page_source):
    """Return the first e-mail address in *page_source*.

    Args:
        page_source: HTML source of a page, as text.

    Returns:
        The first matching address, or ``EMAIL_NOT_FOUND`` when there is none.
    """
    found = EMAIL_PATTERN.search(page_source)
    return found.group(0) if found else EMAIL_NOT_FOUND


def main():
    """Visit every link from the input file and write "<name>: <email>" lines."""
    # Imported here because this script's top-level imports do not provide them
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Read the input first so a missing file fails before a browser is started
    with open(input_file_path, 'r', encoding='utf-8') as file:
        entries = parse_entries(file.read())

    options = Options()
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # NOTE: `executable_path` was removed in Selenium 4.10; on those versions
    # pass `service=Service(chrome_driver_path)` instead
    # (from selenium.webdriver.chrome.service import Service).
    driver = webdriver.Chrome(executable_path=chrome_driver_path, options=options)

    try:
        # Mode 'w' creates the file if needed, so no separate pre-creation step
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for name, url in entries:
                try:
                    driver.get(url)

                    # Give dynamically generated content time to render
                    time.sleep(2)

                    try:
                        # Adjust the element id for the target site
                        email = driver.find_element(By.ID, '需要填充').text
                    except NoSuchElementException:
                        # No dedicated element: scan the page source instead
                        email = first_email(driver.page_source)

                    output_file.write(f'{name}: {email}\n')
                except Exception as e:
                    # Best effort per link: report the failure and move on
                    print(f"打开链接失败: {e}")
    finally:
        # Always close the browser, even if an error escaped the loop
        driver.quit()


if __name__ == "__main__":
    main()

结果:现TXT个人网址文本内容

abc: abc@qq.com

你可能感兴趣的:(python,python,爬虫,json)