Design a Python/pandas program driven by a configuration file of multiple labelled Snowflake SQL templates and several parameter groups. Based on the input parameters, it should automatically batch-export data from a Snowflake database to CSV files in a specified directory and then compress each file into a zip archive. The export file name is composed of the label and the parameter values joined by "_", and existing files are overwritten. The program must handle large result sets by exporting in batches, use multithreading and asynchronous operations to improve performance, and include exception handling and reporting: on failure it prints the error message, and for every query it logs the run status, the number of rows exported, the run timestamp, the export duration, and the record count of each output file.
A Python/pandas implementation of the export, compression and logging workflow, driven by labelled Snowflake SQL templates and parameter groups:
import os
import re
import logging
import zipfile
import pandas as pd
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import snowflake.connector
import yaml
# Logging configuration. A handler-level filter supplies defaults for the custom
# "label"/"params" fields so that records emitted without the extra dict
# (e.g. by third-party libraries) still format correctly.
class ContextFilter(logging.Filter):
    def filter(self, record):
        record.label = getattr(record, "label", "-")
        record.params = getattr(record, "params", "-")
        return True

handlers = [logging.FileHandler("data_export.log"), logging.StreamHandler()]
for handler in handlers:
    handler.addFilter(ContextFilter())

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - [%(label)s] [%(params)s] %(message)s",
    handlers=handlers
)
logger = logging.getLogger(__name__)
def load_config(config_path):
"""加载YAML配置文件"""
with open(config_path) as f:
return yaml.safe_load(f)
def sanitize_filename(value):
"""清理文件名中的特殊字符"""
return re.sub(r'[^a-zA-Z0-9_-]', '_', str(value))
def generate_filename(label, params):
"""生成标准化文件名"""
sorted_params = sorted(params.items())
param_str = "_".join(f"{sanitize_filename(v)}" for _, v in sorted_params)
return f"{label}_{param_str}"
def export_to_csv_zip(args):
"""处理单个导出任务"""
label, sql_template, params, output_dir, batch_size = args
filename = generate_filename(label, params)
csv_path = os.path.join(output_dir, f"{filename}.csv")
zip_path = os.path.join(output_dir, f"{filename}.zip")
    # Remove any existing output files so they are overwritten
for path in [csv_path, zip_path]:
if os.path.exists(path):
os.remove(path)
conn = None
total_rows = 0
start_time = datetime.now()
log_extra = {"label": label, "params": str(params)}
try:
logger.info(f"Export started", extra=log_extra)
# 初始化Snowflake连接
conn = snowflake.connector.connect(
user=os.getenv("SNOWFLAKE_USER"),
password=os.getenv("SNOWFLAKE_PASSWORD"),
account=os.getenv("SNOWFLAKE_ACCOUNT"),
warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
database=os.getenv("SNOWFLAKE_DATABASE"),
schema=os.getenv("SNOWFLAKE_SCHEMA")
)
cursor = conn.cursor()
        # Execute the parameterized query (%(name)s bind parameters)
cursor.execute(sql_template, params)
        # Stream the result set in batches to keep memory usage bounded
columns = [col.name for col in cursor.description]
header = True
while True:
rows = cursor.fetchmany(batch_size)
if not rows:
break
df = pd.DataFrame(rows, columns=columns)
df.to_csv(
csv_path,
mode="a",
index=False,
header=header,
encoding="utf-8"
)
batch_rows = len(df)
total_rows += batch_rows
header = False
logger.info(
f"Exported {batch_rows} rows (Total: {total_rows})",
extra=log_extra
)
        # If the query returned no rows, still write a header-only CSV so the zip step succeeds
        if total_rows == 0:
            pd.DataFrame(columns=columns).to_csv(csv_path, index=False, encoding="utf-8")
        # Compress the CSV into a zip archive
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.write(csv_path, arcname=os.path.basename(csv_path))
duration = datetime.now() - start_time
logger.info(
f"Export completed: {total_rows} rows in {duration}",
extra=log_extra
)
except Exception as e:
logger.error(
f"Export failed: {str(e)}",
extra=log_extra,
exc_info=True
)
finally:
if conn:
conn.close()
if os.path.exists(csv_path):
os.remove(csv_path)
def main():
    # Load SQL templates and parameter groups
config = load_config("config/sql_templates.yaml")
params_config = load_config("config/params.yaml")
    # Prepare the output directory
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
    # Build the list of export tasks
    tasks = []
    batch_size = 100000  # rows fetched per batch
for template in config["templates"]:
label = template["label"]
sql = template["sql"]
        # Look up the parameter sets configured for this template label
        params_list = next(
            (p["params"] for p in params_config["parameters"]
             if p["label"] == label),
            []
        )
        for params in params_list:
tasks.append((
label,
sql,
params,
output_dir,
batch_size
))
    # Run export tasks in parallel with a thread pool
with ThreadPoolExecutor(max_workers=4) as executor:
executor.map(export_to_csv_zip, tasks)
if __name__ == "__main__":
main()
Example configuration files

config/sql_templates.yaml:
templates:
- label: sales_data
sql: |
SELECT *
FROM sales
WHERE region = %(region)s
AND sale_date BETWEEN %(start_date)s AND %(end_date)s
- label: customer_info
sql: |
SELECT *
FROM customers
WHERE status = %(status)s
AND created_at >= %(min_created)s
config/params.yaml:
parameters:
- label: sales_data
params:
- region: "NORTH"
start_date: "2023-01-01"
end_date: "2023-03-31"
- region: "SOUTH"
start_date: "2023-04-01"
end_date: "2023-06-30"
- label: customer_info
params:
- status: "ACTIVE"
min_created: "2022-01-01"
- status: "INACTIVE"
min_created: "2020-01-01"
Program highlights

Safe parameter handling: queries use %(name)s bind parameters instead of string concatenation, and parameter values are sanitized before they are used in file names.
Efficient batch processing: rows are fetched with fetchmany(batch_size) and appended to the CSV batch by batch, so memory usage stays bounded for large result sets.
Parallel execution: a ThreadPoolExecutor runs several export tasks concurrently.
Reliable error handling: each task has its own try/except, failures are logged with the full traceback, and the connection is always closed in the finally block.
Complete logging: every record carries a timestamp, the template label, the parameter set, per-batch and total row counts and the export duration, written both to data_export.log and to the console.
File management: output files are named <label>_<parameter values>, existing CSV/zip files are overwritten, and the intermediate CSV is removed after zipping.
Usage

Environment setup: install the dependencies and set the SNOWFLAKE_USER, SNOWFLAKE_PASSWORD, SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE, SNOWFLAKE_DATABASE and SNOWFLAKE_SCHEMA environment variables used by the connection:
pip install pyyaml pandas snowflake-connector-python
Configuration files: place config/sql_templates.yaml and config/params.yaml next to the program as shown above and adjust the labels, SQL templates and parameter sets as needed.
Run the program:
python data_export.py
Verify the results: check the output directory for the <label>_<parameter values>.zip files and data_export.log for the status, row count and duration of each export (a small verification sketch follows).
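A minimal verification sketch (a hypothetical helper, not part of the program) that counts the data rows inside each exported zip so the numbers can be compared with the counts written to data_export.log:

import glob
import io
import zipfile

for zip_path in sorted(glob.glob("output/*.zip")):
    with zipfile.ZipFile(zip_path) as zf:
        csv_name = zf.namelist()[0]
        with zf.open(csv_name) as f:
            # subtract 1 for the header line
            rows = sum(1 for _ in io.TextIOWrapper(f, encoding="utf-8")) - 1
    print(f"{zip_path}: {rows} rows")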
Performance tuning suggestions

Adjust these parameters to match warehouse and client capacity: batch_size (rows fetched per batch, set in main()) and max_workers (the thread pool size of the ThreadPoolExecutor).
For very large result sets, consider the connector's Arrow-based batch fetching, or unloading server-side with COPY INTO a Snowflake stage; a hedged sketch of the first option follows this list.
Monitoring: watch data_export.log for ERROR entries, per-file row counts and export durations, and alert when an export fails or runs unusually long.
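A minimal sketch of the Arrow-based alternative mentioned above, assuming the connector is installed with the pandas extra (pip install "snowflake-connector-python[pandas]"); the function name export_large_query is illustrative and not part of the main program:

import os
import zipfile

def export_large_query(conn, sql, params, csv_path, zip_path):
    """Stream the result set as pandas DataFrame batches (Arrow-based) and zip the CSV."""
    total_rows = 0
    header = True
    with conn.cursor() as cursor:
        cursor.execute(sql, params)
        for batch_df in cursor.fetch_pandas_batches():  # yields pandas DataFrames
            batch_df.to_csv(csv_path, mode="a", index=False, header=header, encoding="utf-8")
            total_rows += len(batch_df)
            header = False
    if total_rows:  # only zip when something was written
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            zf.write(csv_path, arcname=os.path.basename(csv_path))
        os.remove(csv_path)
    return total_rows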