In cross-border e-commerce operations (key takeaway!), product prices, user reviews and inventory data are the new oil! But many platforms' anti-scraping defenses are harder to win over than a girlfriend (if you know, you know). Today I'll show you how the proxy IP + Python combo can capture this precious data with ease!
For proxies I recommend 青果代理 (Qingguo Proxy) or 亮数据 (Bright Data), both personally tested and stable. Free proxies? Don't even think about it! Your IPs will get banned so fast you'll question your life choices (a lesson learned in blood and tears)!
# Install the required libraries (the Tsinghua mirror is faster in China~)
# lxml is needed later for the BeautifulSoup parser
pip install requests beautifulsoup4 lxml fake_useragent -i https://pypi.tuna.tsinghua.edu.cn/simple
from fake_useragent import UserAgent
import requests

headers = {
    # Randomly generated browser fingerprint
    'User-Agent': UserAgent().random,
    # Important! Pretend to be an AJAX request
    'X-Requested-With': 'XMLHttpRequest'
}
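Not sure what the server actually sees? A quick sanity check against httpbin.org (a public request-echo service; any similar endpoint works) prints the headers exactly as they arrive:

import requests
from fake_useragent import UserAgent

# Echo back the headers the server receives; the User-Agent changes on every run
check = requests.get(
    'https://httpbin.org/headers',
    headers={'User-Agent': UserAgent().random,
             'X-Requested-With': 'XMLHttpRequest'},
    timeout=10
)
print(check.json()['headers'])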
proxy_list = [
    'http://username:[email protected]:8888',
    'http://username:[email protected]:8888',
    # Prepare at least 20 IPs and rotate through them
]
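Before a long crawl it pays to weed out dead proxies first. Here is a minimal sketch; check_proxies is a helper name I made up for illustration, and it uses httpbin.org/ip as a neutral test target:

import requests

def check_proxies(proxies, test_url='https://httpbin.org/ip'):
    # Keep only the proxies that can still complete a request
    alive = []
    for proxy in proxies:
        try:
            r = requests.get(test_url,
                             proxies={'http': proxy, 'https': proxy},
                             timeout=5)
            if r.status_code == 200:
                alive.append(proxy)
        except Exception:
            pass  # Dead or banned proxy, drop it quietly
    return alive

proxy_list = check_proxies(proxy_list)  # Filter the pool before scraping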
def get_with_proxy(url):
    for proxy in proxy_list:
        try:
            response = requests.get(
                url,
                headers=headers,
                proxies={'http': proxy, 'https': proxy},
                timeout=10  # Never skip the timeout!
            )
            if response.status_code == 200:
                return response.text
        except Exception as e:
            print(f"Proxy {proxy} failed: {str(e)}")
    return None  # Every proxy in the pool is dead
from bs4 import BeautifulSoup

def parse_product_page(html):
    soup = BeautifulSoup(html, 'lxml')
    # Product price (watch out for dynamically loaded content)
    price = soup.select_one('.price-section span').text.strip()
    # Review count (anti-scraping updates change class names often)
    reviews = soup.find('div', {'data-testid': 'review-count'}).text
    return {'price': price, 'reviews': reviews}
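Since those class names really do get shuffled around by anti-scraping updates, one defensive option is to try several selectors in order. The fallback selectors below are purely illustrative, not real ones from any specific site:

from bs4 import BeautifulSoup

def safe_select(soup, selectors):
    # Try each candidate CSS selector until one matches
    for sel in selectors:
        node = soup.select_one(sel)
        if node:
            return node.text.strip()
    return None  # Nothing matched: the page layout probably changed again

def parse_product_page_safe(html):
    soup = BeautifulSoup(html, 'lxml')
    price = safe_select(soup, ['.price-section span', '[data-price]', '.product-price'])
    reviews = safe_select(soup, ['[data-testid="review-count"]', '.review-count'])
    return {'price': price, 'reviews': reviews}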
# Install selenium first (pip install selenium)
import random
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = ChromeOptions()
options.add_argument('--headless')  # Headless mode
options.add_argument(f'--proxy-server={random.choice(proxy_list)}')
driver = webdriver.Chrome(options=options)
driver.get(url)  # url: the product page you want to render
# Crucial! Wait for the dynamic content to finish loading
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'product-detail'))
)
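Once the wait succeeds, the rendered DOM can go straight back into the parser defined earlier. A minimal continuation, assuming the parse_product_page function from above:

html = driver.page_source        # The DOM after JavaScript has finished running
data = parse_product_page(html)  # Reuse the BeautifulSoup parser from above
driver.quit()                    # Always release the browser process
print(data)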
import csv
from datetime import datetime

def save_data(data):
    today = datetime.now().strftime('%Y%m%d')
    filename = f'product_data_{today}.csv'
    # Append mode + write the header automatically
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data.keys())
        if f.tell() == 0:  # Write the header only when the file is empty
            writer.writeheader()
        writer.writerow(data)
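To tie everything together, here is one way an end-to-end run could look, reusing get_with_proxy, parse_product_page and save_data from above (the product URLs are placeholders):

import time
import random

urls = [
    'https://example.com/product/12345',  # placeholder product pages
    'https://example.com/product/67890',
]

for url in urls:
    html = get_with_proxy(url)        # Fetch through the rotating proxy pool
    if not html:
        continue                      # Every proxy failed, skip this item
    data = parse_product_page(html)   # Extract price and review count
    save_data(data)                   # Append to today's CSV
    time.sleep(random.uniform(2, 5))  # Random delay to stay polite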
import hashlib
import time

def generate_fingerprint():
    timestamp = str(int(time.time() * 1000))
    secret = 'site-specific signing secret (found by reverse engineering)'
    sign = hashlib.md5((timestamp + secret).encode()).hexdigest()
    return {'t': timestamp, 'sign': sign}
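Those signature parameters usually ride along as query-string values. The sketch below is only a guess at the shape: api_url and the product_id parameter are made up, since every site names its endpoints and fields differently:

import requests

# api_url and the parameter names are hypothetical placeholders
api_url = 'https://example.com/api/product/detail'
params = {'product_id': '12345', **generate_fingerprint()}
resp = requests.get(api_url, headers=headers, params=params, timeout=10)
print(resp.status_code, resp.text[:200])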
The technology itself isn't to blame, but it does need to be used with care. A few common problems and how to deal with them:
Q: The site suddenly starts returning 403 errors. What now?
A: Check these three things:
1. Is the User-Agent still being randomized on every request?
2. Has the current proxy IP been banned? Rotate to another one from the pool.
3. Are you requesting too fast? Add random delays between requests.
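If you would rather handle the 403 automatically instead of eyeballing it, a rough sketch that reuses the proxy_list and headers defined earlier might look like this (get_with_403_retry is just an illustrative name):

import random
import requests
from fake_useragent import UserAgent

def get_with_403_retry(url, max_retries=3):
    # On a 403, refresh the User-Agent and switch proxy before retrying
    for _ in range(max_retries):
        proxy = random.choice(proxy_list)
        headers['User-Agent'] = UserAgent().random
        resp = requests.get(url, headers=headers,
                            proxies={'http': proxy, 'https': proxy},
                            timeout=10)
        if resp.status_code != 403:
            return resp
    return None  # Still blocked after several attempts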
Q: The returned data comes back garbled?
A: Try these encodings:

response.encoding = response.apparent_encoding  # Auto-detect
# Or set it manually, for example:
response.encoding = 'gbk'  # other common candidates: 'utf-8', 'gb2312'
Scraping is not above the law! The code in this post is for learning and exchange only. If you use it for real, please:
1. Respect the target site's robots.txt and terms of service.
2. Throttle your request rate so you don't strain the site's servers.
3. Never collect personal or otherwise private user data.
Coming up next: "Building a Price Monitoring System with Distributed Scrapy Crawlers"! Follow me so you don't get lost~