python小工具合集

小工具合集

    • 1.python切分excel
    • 2.python检查excel输出每列最大长度[目录下所有文件 or 目录下每个文件]

1.python切分excel

"""
@Project :pythonProject 
@File    :splitFile.py
@IDE     :PyCharm 
@Author  :alice
@Date    :2025/3/21 13:48 
"""
import pandas as pd
import os


def split_excel(input_path, output_prefix,total_num,single_one,num_columns):
    """
    分割大型Excel文件
    参数:
    input_path: 输入文件路径
    output_prefix: 输出文件前缀
    """
    #创建输出目录
    output_dir = os.path.dirname(output_prefix)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
    #
    # # 读取Excel文件
    if(len(num_columns)==0):
        reader = pd.read_excel(input_path, sheet_name='Sheet1',dtype=str)
    else:
        reader = pd.read_excel(input_path, sheet_name='Sheet1', dtype=num_columns)

    num = 0

    for i in range(0,total_num,single_one):
        new_data = reader.iloc[i:i+single_one]
        new_data.to_excel(output_prefix+str(num)+'.xlsx', index=False)

        print(f"文件分割完成!生成 {output_prefix+str(num)+'.xlsx'}")
        num += 1

    print("分割完成!!!")
# 使用示例
if __name__ == "__main__":
    # total_num = 405068
    total_num = 80535  # 总行数
    single_one = 30000 # 单个行数

    input_file = "/Users/zhaotongxue/Downloads/sss.xlsx"  # 输入的大文件路径

    output_prefix = "/Users/zhaotongxue/Desktop/ss/前缀"  # 输出文件前缀(包含路径)
    # num_columns = {"卡号":str}
    num_columns = {}
    split_excel(input_file, output_prefix,total_num,single_one,num_columns)  # 每5万行分割一次

2.python检查excel输出每列最大长度[目录下所有文件 or 目录下每个文件]

"""
@Project :pythonProject
@File    :fileUtils.py
@IDE     :PyCharm
@Author  :alice
@Date    :2024/7/18 10:10
"""
import os

import openpyxl
import pandas as pd


class FileUtils:
    def __init__(self, path):
        self.path = path

    # 遍历路径下的所有非隐藏文件
    def get_files(self):
        files_path = []
        for root, dirs, files in os.walk(self.path):
            files = [file for file in files if not file.startswith(".")]
            for file in files:
                files_path.append(os.path.join(root, file))
        return files_path

    def showMaxLen(self, dic, skp,sheet_name,type):
        global data
        print("文件名称:"+dic)
        if type == "xlsx":
            data = pd.read_excel(dic,sheet_name=sheet_name, skiprows=skp)
        if type == "csv":
            data = pd.read_csv(dic,skiprows=skp)
        for i, col in enumerate(data.columns.tolist()):
            data[col] = data[col].astype(str)
            n_max = data[col].str.len().max()
            print(i, col, n_max)

    def showMaxLen2(self, dic, skp,type):
        num = 0
        cols = {}
        for paths in dic:
            num = num + 1
            if type=="xlsx":
                data = pd.read_excel(paths, skiprows=skp)
            elif type=="csv":
                data = pd.read_csv(paths, skiprows=skp)

            for i, col in enumerate(data.columns.tolist()):
                data[col] = data[col].astype(str)
                n_max = data[col].str.len().max()
                if cols.get(col) is not None and cols.get(col) < n_max:
                    # print(col, cols[col], n_max)
                    cols.update({col: n_max})
                elif cols.get(col) is None:
                    cols.update({col: n_max})
        print("总文件数:{}".format(num))
        return cols




if __name__ == '__main__':
   #遍历目录下所有文件,获取列的最大值
   path = "/Users/zhaotongxue/Desktop/黑龙江"
   fileUtils = FileUtils(path)
   files = fileUtils.get_files()
   columns = fileUtils.showMaxLen2(files,skp=0,type="xlsx")
   print(columns)
   #遍历目录下所有文件,获取每个列的最大值
   for file in files:
    fileUtils.showMaxLen(file,skp=0,sheet_name="Sheet1",type="xlsx")



你可能感兴趣的:(python,excel,开发语言)