一.百度识图自动上传图片
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
# Point Selenium at the locally installed Edge binary.
edge_options = Options()
edge_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
driver = webdriver.Edge(options=edge_options)
try:
    # Open the Baidu image-search page.
    driver.get('https://graph.baidu.com/pcpage/index?tpl_from=pc')
    # Locate the element whose name attribute is "file" and send it the local
    # image path; this selects the file without going through the OS dialog.
    driver.find_element(by=By.NAME, value='file').send_keys(r"D:\7.18\图1.jpg")
    # Block until the user presses Enter so the result can be inspected.
    input('')
finally:
    # Always end the WebDriver session so no browser/driver process is leaked,
    # even when one of the steps above raises.
    driver.quit()
这段代码使用 Selenium 库实现了自动打开百度识图网页并上传本地图片的功能。下面是对代码的逐行解释:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
导入所需的模块:webdriver 用于控制浏览器,Options 用于配置浏览器选项,By 用于定位网页元素。
# Create an Edge options object that carries the browser configuration.
edge_options = Options()
# Tell Selenium where the Edge executable is installed on this machine.
edge_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
# Start an Edge session using the options configured above.
driver = webdriver.Edge(options=edge_options)
# Navigate to the Baidu image-search page.
driver.get('https://graph.baidu.com/pcpage/index?tpl_from=pc')
# Find the element whose name attribute is "file" and send it the local image path.
driver.find_element(by=By.NAME, value='file').send_keys(r"D:\7.18\图1.jpg")
find_element() 用于定位页面上的文件上传元素(通过 name 属性值为 file 来查找)。send_keys() 方法模拟键盘输入,将本地图片的路径发送给上传元素,从而实现自动上传图片。input('') 使脚本在此暂停,直到用户按下回车键后才继续执行。
上述代码使用了 Selenium WebDriver,它需要启动一个真实的浏览器来执行操作。如果只需要获取网页的静态资源(如 HTML 内容、JSON 数据等),可以使用更轻量级的库,如 requests(示例代码见本节末尾)。
元素交互方法:
click():模拟鼠标点击元素,常用于按钮、链接等可点击元素。
send_keys(text):模拟键盘输入文本到输入框等元素中。
clear():清空输入框中的内容。
submit():提交表单,通常用于表单中的提交按钮。
浏览器导航方法:
back():模拟浏览器的后退按钮,返回上一页。
forward():模拟浏览器的前进按钮,前进到下一页。
refresh():刷新当前页面。
get(url):打开指定 URL 的网页。
current_url(属性):获取当前页面的 URL。
浏览器控制方法:
close():关闭当前浏览器窗口。
quit():退出整个浏览器进程,关闭所有窗口。
maximize_window():最大化浏览器窗口。
set_window_size(width, height):设置浏览器窗口大小。
元素定位方法:
find_element(By.ID, value):通过元素 ID 定位。
find_element(By.NAME, value):通过元素 name 属性定位。
find_element(By.CSS_SELECTOR, value):通过 CSS 选择器定位。
find_element(By.XPATH, value):通过 XPath 表达式定位。
find_elements():返回所有匹配的元素列表。
import requests
url = 'https://graph.baidu.com/pcpage/index?tpl_from=pc'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
if response.status_code == 200:
    # Read the HTML body of the page.
    html_content = response.text
    print(html_content)
else:
    print(f"请求失败,状态码:{response.status_code}")
二.批量获取 Excel 相关图书信息
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
def get_info(driver):
    """Record name, price and author for every book on the current list page.

    Each element with class ``book_item`` is clicked; the click is expected to
    open the book's detail page in a new window/tab.  The detail fields are
    read there and written as one line to the module-level ``file`` object,
    after which the detail window is closed and focus returns to the list.

    Args:
        driver: Selenium WebDriver currently focused on the book-list window.

    Raises:
        selenium.common.exceptions.TimeoutException: if a click does not open
            a new window within 10 seconds.
        selenium.common.exceptions.NoSuchElementException: if a detail field
            is missing on the detail page.
    """
    # Imported locally so this function does not depend on extra file-level imports.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Remember the list window explicitly instead of relying on the position
    # of its handle inside driver.window_handles.
    list_window = driver.current_window_handle

    time.sleep(5)  # give the list page time to render
    eles_p = driver.find_elements(By.CLASS_NAME, 'book_item')
    print(f"找到 {len(eles_p)} 个图书项")  # debug output
    for ele_p in eles_p:
        handles_before = driver.window_handles
        ele_p.click()
        # Wait for the detail window to exist before looking up its handle.
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
        detail_window = next(
            handle for handle in driver.window_handles
            if handle not in handles_before
        )
        driver.switch_to.window(detail_window)
        try:
            time.sleep(5)  # give the detail page time to render
            name = driver.find_element(By.CLASS_NAME, 'book-name').text
            price = driver.find_element(By.CLASS_NAME, 'price').text
            author = driver.find_element(By.CLASS_NAME, 'book-author').text
            file.write(f'图书名:{name}\t价格:{price}\t作者名:{author}\n')
            print(f"已保存:{name}")  # debug output
        finally:
            # Close the detail window and return to the list even on failure,
            # so a single bad page does not leave a stray tab or wrong focus.
            driver.close()
            driver.switch_to.window(list_window)
# ``file`` stays a module-level name because get_info() writes to it as a global.
# The with-block guarantees the output file is flushed and closed on any exit path.
with open('excel图书汇总.txt', 'w', encoding='utf-8') as file:
    # Despite the variable name, these are Edge options passed to webdriver.Edge.
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    driver = webdriver.Edge(options=chrome_options)
    try:
        driver.get('https://www.ptpress.com.cn/')
        # Type the keyword into the first <input> on the page and press Enter.
        elements = driver.find_elements(By.TAG_NAME, "input")
        elements[0].send_keys("excel" + Keys.RETURN)
        # Switch to the second window handle (the search is expected to open one).
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        # Click the element with id "booksMore", then focus the newest window.
        driver.find_element(By.ID, "booksMore").click()
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        get_info(driver)
        page_num = 1  # current page number
        while True:
            try:
                # Look for the "next page" button.
                next_button = driver.find_element(By.CLASS_NAME, 'ivu-page-next')
                # A disabled button means the last page has been reached
                # (the disabled class name is site-specific).
                if 'ivu-page-disabled' in next_button.get_attribute('class'):
                    print(f"已到达最后一页(第{page_num}页),停止爬取")
                    break
                next_button.click()
                page_num += 1
                print(f"已翻到第{page_num}页")
                time.sleep(3)  # wait for the next page to load
                get_info(driver)
            except Exception as e:
                # Top-level boundary of the crawl loop: report and stop paging.
                print(f"爬取过程中出错:{e}")
                print(f"最后成功爬取的是第{page_num}页")
                break
    finally:
        # Always shut the browser down, including when setup or the first
        # get_info() call raises outside the paging loop.
        driver.quit()
这段代码使用 Selenium 自动化浏览器操作,从人民邮电出版社网站批量获取 Excel 相关图书的信息,并保存到文本文件中。下面是对代码的详细解释:
这个程序主要分为以下几个部分:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
def get_info(driver):
    """Record name, price and author for every book on the current list page.

    Each element with class ``book_item`` is clicked; the click is expected to
    open the book's detail page in a new window/tab.  The detail fields are
    read there and written as one line to the module-level ``file`` object,
    after which the detail window is closed and focus returns to the list.

    Args:
        driver: Selenium WebDriver currently focused on the book-list window.

    Raises:
        selenium.common.exceptions.TimeoutException: if a click does not open
            a new window within 10 seconds.
        selenium.common.exceptions.NoSuchElementException: if a detail field
            is missing on the detail page.
    """
    # Imported locally so this function does not depend on extra file-level imports.
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Remember the list window explicitly instead of relying on the position
    # of its handle inside driver.window_handles.
    list_window = driver.current_window_handle

    time.sleep(5)  # give the list page time to render
    eles_p = driver.find_elements(By.CLASS_NAME, 'book_item')
    print(f"找到 {len(eles_p)} 个图书项")  # debug output
    for ele_p in eles_p:
        handles_before = driver.window_handles
        ele_p.click()
        # Wait for the detail window to exist before looking up its handle.
        WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
        detail_window = next(
            handle for handle in driver.window_handles
            if handle not in handles_before
        )
        driver.switch_to.window(detail_window)
        try:
            time.sleep(5)  # give the detail page time to render
            name = driver.find_element(By.CLASS_NAME, 'book-name').text
            price = driver.find_element(By.CLASS_NAME, 'price').text
            author = driver.find_element(By.CLASS_NAME, 'book-author').text
            file.write(f'图书名:{name}\t价格:{price}\t作者名:{author}\n')
            print(f"已保存:{name}")  # debug output
        finally:
            # Close the detail window and return to the list even on failure,
            # so a single bad page does not leave a stray tab or wrong focus.
            driver.close()
            driver.switch_to.window(list_window)
# ``file`` stays a module-level name because get_info() writes to it as a global.
# The with-block guarantees the output file is flushed and closed on any exit path.
with open('excel图书汇总.txt', 'w', encoding='utf-8') as file:
    # Despite the variable name, these are Edge options passed to webdriver.Edge.
    chrome_options = Options()
    chrome_options.binary_location = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    driver = webdriver.Edge(options=chrome_options)
    try:
        driver.get('https://www.ptpress.com.cn/')
        # Type the keyword into the first <input> on the page and press Enter.
        elements = driver.find_elements(By.TAG_NAME, "input")
        elements[0].send_keys("excel" + Keys.RETURN)
        # Switch to the second window handle (the search is expected to open one).
        handles = driver.window_handles
        driver.switch_to.window(handles[1])
        # Click the element with id "booksMore", then focus the newest window.
        driver.find_element(By.ID, "booksMore").click()
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        get_info(driver)
        page_num = 1  # current page number
        while True:
            try:
                # Look for the "next page" button.
                next_button = driver.find_element(By.CLASS_NAME, 'ivu-page-next')
                # A disabled button means the last page has been reached.
                if 'ivu-page-disabled' in next_button.get_attribute('class'):
                    print(f"已到达最后一页(第{page_num}页),停止爬取")
                    break
                next_button.click()
                page_num += 1
                print(f"已翻到第{page_num}页")
                time.sleep(3)  # wait for the next page to load
                get_info(driver)
            except Exception as e:
                # Top-level boundary of the crawl loop: report and stop paging.
                print(f"爬取过程中出错:{e}")
                print(f"最后成功爬取的是第{page_num}页")
                break
    finally:
        # Always shut the browser down, including when setup or the first
        # get_info() call raises outside the paging loop.
        driver.quit()
等待机制改进:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Wait (up to 10 seconds) until the book-item elements are present in the DOM.
book_item_locator = (By.CLASS_NAME, 'book_item')
book_item_wait = WebDriverWait(driver, 10)
eles_p = book_item_wait.until(
    EC.presence_of_all_elements_located(book_item_locator)
)
异常处理增强:
from selenium.common.exceptions import NoSuchElementException

try:
    name = driver.find_element(By.CLASS_NAME, 'book-name').text
except NoSuchElementException:
    # Only a missing element falls back to the placeholder; other errors
    # (lost session, KeyboardInterrupt, ...) are no longer silently swallowed.
    name = "未找到书名"
数据存储优化:
import csv
# newline='' lets the csv module control line endings itself.
with open('books.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Header row first, then one row per book.
    writer.writerow(['图书名', '价格', '作者名'])
    writer.writerow([name, price, author])
增加日志记录:
import logging
# Emit INFO and above, prefixed with timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Pass the count as a lazy %-argument so the message is only formatted
# when the record is actually emitted.
logging.info("找到 %d 个图书项", len(eles_p))
这个程序通过自动化浏览器操作,成功实现了批量获取图书信息的功能。通过合理的优化,可以进一步提高代码的稳定性和可维护性。