Merge branch 'develop'

CoderWanFeng · Sep 9, 2022 · 53fd04e · 53fd04e
2 parents 981b6fe + 6942145
commit 53fd04e
Show file tree

Hide file tree

Showing 38 changed files with 432 additions and 108 deletions.
diff --git a/contributors/bulabean/SEdemo.xlsx b/contributors/bulabean/SEdemo.xlsx
diff --git a/contributors/bulabean/SEdemo_Split_2022-09-08_162027.xlsx b/contributors/bulabean/SEdemo_Split_2022-09-08_162027.xlsx
diff --git a/contributors/bulabean/SEdemo_Split_2022-09-08_162113.xlsx b/contributors/bulabean/SEdemo_Split_2022-09-08_162113.xlsx
diff --git a/contributors/bulabean/SearchExcel.py b/contributors/bulabean/SearchExcel.py
@@ -0,0 +1,138 @@
+import os
+import openpyxl
+import xlrd
+import datetime
+import time
+
+
+def change_datatype(row_data: list):
+    """
+    Convert every cell value of one Excel row to its text form.
+
+    Args:
+        row_data: Cell values of a single row, as a list.
+
+    Returns:
+        A new list of strings, one per input cell, in the same order.
+        ``datetime.datetime`` values are formatted as ``YYYY-MM-DD HH:MM:SS``,
+        ``None`` becomes an empty string, strings are kept unchanged and
+        every other value is passed through ``str()``.
+    """
+    result_data = []
+    for rd in row_data:
+        if rd is None:
+            # Compare the value itself: `type(rd) is None` can never be true,
+            # which would render empty cells as the text "None".
+            t = ''
+        elif isinstance(rd, datetime.datetime):
+            t = rd.strftime("%Y-%m-%d %H:%M:%S")
+        elif isinstance(rd, str):
+            t = rd
+        else:
+            # int, float and any other cell type share the same conversion.
+            t = str(rd)
+        result_data.append(t)
+    return result_data
+
+
+def find_key(search_key: str, row_content: str):
+    """
+    Tell whether a keyword occurs in the text of a row.
+
+    Args:
+        search_key: Keyword to look for.
+        row_content: Text of the row.
+
+    Returns:
+        True if ``search_key`` is contained in ``row_content``, else False.
+    """
+    return search_key in row_content
+
+
+def process_xls(path, file):
+    """
+    Read an .xls workbook and yield its non-blank rows as text.
+
+    Args:
+        path: Directory that contains the file.
+        file: File name.
+
+    Yields:
+        Tuples ``(filepath, sheet name, zero-based row index, row text)``,
+        where the row text is the cell values joined with single spaces.
+        Nothing is yielded when the workbook cannot be opened.
+    """
+    filepath = os.path.join(path, file)
+    try:
+        rb = xlrd.open_workbook(filepath, formatting_info=True)
+    except Exception:
+        # Unreadable files are skipped on purpose; catching Exception (not a
+        # bare except) lets KeyboardInterrupt still propagate.
+        return
+    max_blank_rows = 10
+    for ws_name in rb.sheet_names():
+        ws = rb.sheet_by_name(ws_name)
+        cols = ws.ncols
+        # Counted per sheet and per run of blank rows, so blanks seen earlier
+        # cannot cut off the scan of later data.
+        blank_rows = 0
+        for r in range(ws.nrows):
+            values = [ws.cell(r, c).value for c in range(cols)]
+            values = " ".join(change_datatype(values))
+            # A row of empty cells joins to spaces only, so test the stripped text.
+            if values.strip():
+                blank_rows = 0
+                yield filepath, ws_name, r, values  # file path, sheet name, row index, row text
+            elif blank_rows < max_blank_rows:
+                blank_rows += 1
+            else:
+                # Too many blank rows in a row: stop scanning this sheet.
+                break
+
+
+def process_xlsx(path, file):
+    """
+    Read an .xlsx workbook and yield its non-blank rows as text.
+
+    Args:
+        path: Directory that contains the file.
+        file: File name.
+
+    Yields:
+        Tuples ``(filepath, sheet name, zero-based row index, row text)``,
+        where the row text is the non-empty cell values joined with single
+        spaces. Nothing is yielded when the workbook cannot be opened.
+    """
+    filepath = os.path.join(path, file)
+    try:
+        wb = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
+    except Exception:
+        # Unreadable files are skipped on purpose; catching Exception (not a
+        # bare except) lets KeyboardInterrupt still propagate.
+        return
+    max_blank_rows = 10
+    try:
+        for ws_name in wb.sheetnames:
+            ws = wb[ws_name]
+            # Counted per sheet and per run of blank rows, so blanks seen
+            # earlier cannot cut off the scan of later data.
+            blank_rows = 0
+            for index, row in enumerate(ws.rows):
+                values = [r.value for r in row if r.value is not None]
+                values = " ".join(change_datatype(values))
+                # Whitespace-only rows count as blank as well.
+                if values.strip():
+                    blank_rows = 0
+                    yield filepath, ws_name, index, values  # file path, sheet name, row index, row text
+                elif blank_rows < max_blank_rows:
+                    blank_rows += 1
+                else:
+                    # Too many blank rows in a row: stop scanning this sheet.
+                    break
+    finally:
+        # A read-only workbook keeps the file open until closed explicitly;
+        # the finally clause also runs when the consumer stops iterating early.
+        wb.close()
+
+
+def find_excel_data(search_key: str, target_dir: str):
+    """
+    Walk a directory tree and yield the Excel rows that contain a keyword.
+
+    Args:
+        search_key: Keyword to search for.
+        target_dir: Root folder of the search.
+
+    Yields:
+        Tuples ``(filepath, sheet name, row index, row text)`` for each row
+        whose text contains ``search_key``. Within every folder the .xls
+        files are processed before the .xlsx files.
+    """
+
+    def matching_rows(rows):
+        # Keep only the rows whose text contains the keyword.
+        for filepath, ws_name, index, values in rows:
+            if find_key(search_key, values):
+                yield filepath, ws_name, index, values
+
+    for folder, _subfolders, names in os.walk(target_dir):
+        # Names starting with '~$' are skipped (files currently open in Excel).
+        candidates = [name for name in names if not name.startswith('~$')]
+        for name in candidates:
+            if name.endswith('.xls'):
+                yield from matching_rows(process_xls(folder, name))
+        for name in candidates:
+            if name.endswith('.xlsx'):
+                yield from matching_rows(process_xlsx(folder, name))
+
+
+if __name__ == '__main__':
+    # Demo run: search the current directory tree for a keyword, print every
+    # matching row and report the elapsed wall-clock time.
+    search_key = '刘家站垦殖场'
+    target_dir = './'
+    started = time.time()
+    for hit in find_excel_data(search_key, target_dir):
+        print(list(hit))
+    elapsed = round(time.time() - started, 2)
+    print("\n程序运行结束，停止运行。耗时：{}秒".format(elapsed))
diff --git a/contributors/bulabean/SplitExcel.py b/contributors/bulabean/SplitExcel.py
@@ -0,0 +1,93 @@
+import os
+import xlrd, xlwt
+import openpyxl
+import datetime
+
+
+#
+
+def generate_xls(filepath: str, worksheet_data: dict):
+    """
+    Write grouped rows to a new .xls workbook next to the source file.
+
+    Args:
+        filepath: Path of the source workbook; the output path is derived
+            from it.
+        worksheet_data: Mapping of sheet name -> list of rows, each row being
+            a list of cell values. One sheet is written per key.
+
+    Returns:
+        Path of the new file, ``<stem>_Split_<YYYY-mm-dd_HHMMSS><extension>``.
+    """
+    datetime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
+    # Split off the extension instead of using str.replace(), which would
+    # rewrite every '.xls' occurrence in the path (directory names included).
+    root, extension = os.path.splitext(filepath)
+    new_filepath = '{}_Split_{}{}'.format(root, datetime_str, extension)
+    new_workbook = xlwt.Workbook(encoding='utf-8')
+    for worksheet_name, row_data_list in worksheet_data.items():
+        # Keys come from cell values and may be numbers; sheet names must be text.
+        new_worksheet = new_workbook.add_sheet(str(worksheet_name))
+        for row_index, row_data in enumerate(row_data_list):
+            for column_index, data in enumerate(row_data):
+                new_worksheet.write(row_index, column_index, data)
+    new_workbook.save(new_filepath)
+    return new_filepath
+
+
+def process_xls(filepath, column: int, worksheet_name: str = None):
+    """
+    Split one sheet of an .xls workbook into several sheets by a column's value.
+
+    Rows are grouped by the value found in the chosen column and every group
+    is written to its own sheet of a new workbook via ``generate_xls``.
+
+    Args:
+        filepath: Path of the .xls file.
+        column: 1-based number of the column whose values define the groups.
+        worksheet_name: Sheet to read; the first sheet is used when omitted.
+
+    Returns:
+        A message string: the new file's name on success, or the reason the
+        file was not processed (unreadable file, column out of range).
+    """
+    try:
+        workbook = xlrd.open_workbook(filepath, formatting_info=True)
+    except Exception:
+        return "文件读取异常：{}".format(filepath)
+    if worksheet_name:
+        worksheet = workbook.sheet_by_name(worksheet_name)
+    else:
+        worksheet = workbook.sheet_by_index(0)
+    rows = worksheet.nrows
+    cols = worksheet.ncols
+    # Same bounds check and message as the .xlsx reader; it also rejects
+    # column < 1, which would otherwise index from the end of the row.
+    if column < 1 or cols < column:
+        return "最大列数是{}，取不到第{}列".format(cols, column)
+    split_data_dict = {}
+    for r in range(rows):
+        row_data = []
+        for c in range(cols):
+            value = worksheet.cell(r, c).value
+            # Only empty cells become ' '; a plain truthiness test would also
+            # blank out legitimate values such as 0 or False.
+            row_data.append(' ' if value is None or value == '' else value)
+        split_data_dict.setdefault(row_data[column - 1], []).append(row_data)
+    new_filepath = generate_xls(filepath, split_data_dict)
+    return "数据保存在新文件中，文件名：{}".format(new_filepath)
+
+
+def generate_xlsx(filepath: str, worksheet_data: dict):
+    """
+    Write grouped rows to a new .xlsx workbook next to the source file.
+
+    Args:
+        filepath: Path of the source workbook; the output path is derived
+            from it.
+        worksheet_data: Mapping of sheet name -> list of rows, each row being
+            a list of cell values. One sheet is written per key.
+
+    Returns:
+        Path of the new file, ``<stem>_Split_<YYYY-mm-dd_HHMMSS><extension>``.
+    """
+    datetime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
+    # Split off the extension instead of using str.replace(), which would
+    # rewrite every '.xlsx' occurrence in the path (directory names included).
+    root, extension = os.path.splitext(filepath)
+    new_filepath = '{}_Split_{}{}'.format(root, datetime_str, extension)
+    new_workbook = openpyxl.Workbook()
+    if worksheet_data:
+        # Workbook() starts with one empty default sheet; drop it so the
+        # output holds only the generated sheets. It is kept when there is
+        # nothing to write, since a workbook needs a sheet to be saved.
+        new_workbook.remove(new_workbook.active)
+    for worksheet_name, row_data_list in worksheet_data.items():
+        # Keys come from cell values and may be numbers; sheet titles must be text.
+        new_worksheet = new_workbook.create_sheet(str(worksheet_name))
+        for row_data in row_data_list:
+            new_worksheet.append(row_data)
+    new_workbook.save(new_filepath)
+    return new_filepath
+
+
+def process_xlsx(filepath: str, column: int, worksheet_name: str = None):
+    """
+    Split one sheet of an .xlsx workbook into several sheets by a column's value.
+
+    Rows are grouped by the value found in the chosen column and every group
+    is written to its own sheet of a new workbook via ``generate_xlsx``.
+
+    Args:
+        filepath: Path of the .xlsx file.
+        column: 1-based number of the column whose values define the groups.
+        worksheet_name: Sheet to read; the active sheet is used when omitted.
+
+    Returns:
+        A message string: the new file's name on success, or the reason the
+        file was not processed (unreadable file, column out of range).
+    """
+    try:
+        workbook = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
+    except Exception:
+        return "文件读取异常：{}".format(filepath)
+    try:
+        if worksheet_name:
+            # Subscript access replaces the deprecated get_sheet_by_name().
+            worksheet = workbook[worksheet_name]
+        else:
+            worksheet = workbook.active
+        max_column = worksheet.max_column
+        # In read-only mode max_column may be None when the file stores no
+        # dimension info, so only compare when it is known. column < 1 is
+        # rejected because it would index from the end of the row.
+        if column < 1 or (max_column is not None and max_column < column):
+            return "最大列数是{}，取不到第{}列".format(max_column, column)
+
+        split_data_dict = {}
+        for row in worksheet.rows:
+            # Only empty cells become ' '; a plain truthiness test would also
+            # blank out legitimate values such as 0 or False.
+            row_data = [
+                ' ' if cell.value is None or cell.value == '' else cell.value
+                for cell in row
+            ]
+            split_data_dict.setdefault(row_data[column - 1], []).append(row_data)
+    finally:
+        # A read-only workbook keeps the file open until closed explicitly.
+        workbook.close()
+    new_filepath = generate_xlsx(filepath, split_data_dict)
+    return "数据保存在新文件中，文件名：{}".format(new_filepath)
+
+
+def split_excel(filepath: str, column: int, worksheet_name: str = None):
+    """
+    Split an Excel file by a column, choosing the reader from the extension.
+
+    Args:
+        filepath: Path of the workbook; must end with '.xlsx' or '.xls'.
+        column: Column number forwarded to the reader.
+        worksheet_name: Optional sheet name forwarded to the reader.
+
+    Returns:
+        The message string produced by ``process_xlsx`` or ``process_xls``,
+        or a fixed message when the extension is neither of the two.
+    """
+    if filepath.endswith('.xlsx'):
+        return process_xlsx(filepath, column, worksheet_name)
+    if filepath.endswith('.xls'):
+        return process_xls(filepath, column, worksheet_name)
+    return "文件格式不对，不进行处理"
+
+
+if __name__ == "__main__":
+    # Demo: split the sample workbook by its sixth column. No worksheet_name
+    # is passed, so the reader falls back to the file's default sheet.
+    # Point filename at 'SEdemo.xlsx' instead to exercise the .xlsx reader.
+    filename = 'sedemo.xls'
+    print(split_excel(filename, 6))
diff --git a/contributors/bulabean/sedemo.xls b/contributors/bulabean/sedemo.xls
diff --git a/contributors/bulabean/sedemo_Split_2022-08-23_203011.xls b/contributors/bulabean/sedemo_Split_2022-08-23_203011.xls
diff --git a/contributors/bulabean/sedemo_Split_2022-08-23_203413.xls b/contributors/bulabean/sedemo_Split_2022-08-23_203413.xls
diff --git a/office/core/ImageType.py b/office/core/ImageType.py
@@ -11,7 +11,7 @@
 
 class MainImage():
 
-    # 自动生成gif
+    #TODO:自动生成gif
     def image2gif(self):
         im = Image.open("1.jpg")
         images = []

diff --git a/office/lib/pdf/add_watermark_service.py b/office/lib/pdf/add_watermark_service.py
@@ -4,6 +4,7 @@
 from reportlab.pdfgen import canvas
 from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.pdfbase.pdfmetrics import registerFont
+from tqdm import tqdm
 
 def create_watermark(content):
     """创建PDF水印模板

diff --git a/setup.py b/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-#-*- coding:utf-8 -*-
+# -*- coding:utf-8 -*-
 
 #############################################
 # File Name: setup.py
@@ -9,6 +9,6 @@
 # Description: https://mp.weixin.qq.com/s/zzD4pxNMFd0ZuWqXlVFdAg
 #############################################
 
-from setuptools import setup            #这个包没有的可以pip一下
+from setuptools import setup  # 这个包没有的可以pip一下
 
 setup()
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1 +1,5 @@
-# pip install python-office -i https://pypi.python.org/simple -U
+# pip install python-office -i https://pypi.python.org/simple -U
+
+# 1、pip freeze > allpackages.txt
+# 2、pip uninstall -r allpackages.txt -y
+# 3、pip install --upgrade python-office
diff --git a/tests/add_img.pdf b/tests/add_img.pdf
diff --git a/tests/test_excel.py b/tests/test_excel.py
diff --git a/tests/test_file.py b/tests/test_file.py
diff --git a/tests/test_dev.py → ...s/file/add_fix/2插入内容测试022app_fix添加后缀测试.py b/tests/test_dev.py → ...s/file/add_fix/2插入内容测试022app_fix添加后缀测试.py
diff --git a/tests/test_files/md/test.txt b/tests/test_files/md/test.txt
diff --git a/tests/test_files/pdf/encrypt.pdf b/tests/test_files/pdf/encrypt.pdf
diff --git a/tests/test_files/pdf/images_0.png b/tests/test_files/pdf/images_0.png
diff --git a/tests/test_files/pdf/in.docx b/tests/test_files/pdf/in.docx
diff --git a/tests/test_files/ppt/test_pdf.pptx b/tests/test_files/ppt/test_pdf.pptx
diff --git a/tests/test_files/ruiming/test.txt b/tests/test_files/ruiming/test.txt
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -0,0 +1,9 @@
+import unittest
+
+if __name__ == '__main__':
+    # Collect every test_*.py module under ./test_unit into one suite.
+    test_dir = './test_unit'
+    suite = unittest.defaultTestLoader.discover(test_dir, pattern='test_*.py')
+    # Create the runner and write the verbose report to a file. The encoding
+    # is fixed so the report does not depend on the platform's default code
+    # page (non-ASCII test names or messages could otherwise fail to encode).
+    with open('test_result.txt', 'w+', encoding='utf-8') as f:
+        runner = unittest.TextTestRunner(stream=f, verbosity=2)
+        runner.run(suite)
diff --git a/tests/test_pdf.py b/tests/test_pdf.py
diff --git a/tests/test_unit/__init__.py b/tests/test_unit/__init__.py
diff --git a/tests/test_unit/output/0816.jpg b/tests/test_unit/output/0816.jpg
diff --git a/tests/test_unit/test_dev.py b/tests/test_unit/test_dev.py