Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
CoderWanFeng committed Sep 9, 2022
2 parents 981b6fe + 6942145 commit 53fd04e
Show file tree
Hide file tree
Showing 38 changed files with 432 additions and 108 deletions.
Binary file added contributors/bulabean/SEdemo.xlsx
Binary file not shown.
Binary file not shown.
Binary file not shown.
138 changes: 138 additions & 0 deletions contributors/bulabean/SearchExcel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import os
import openpyxl
import xlrd
import datetime
import time


def change_datatype(row_data: list):
    """
    Convert every cell value of one Excel row to its string form.

    Args:
        row_data: Cell values of a single row, as a list.

    Returns:
        A new list with one string per input value: ``None`` becomes ``''``,
        datetimes are rendered as ``%Y-%m-%d %H:%M:%S``, strings are kept
        unchanged and every other value goes through ``str()``.
    """
    result_data = []
    for rd in row_data:
        if rd is None:
            # The previous test `type(rd) is None` could never be true, so
            # empty cells leaked through as the literal text 'None'.
            t = ''
        elif isinstance(rd, datetime.datetime):
            t = rd.strftime("%Y-%m-%d %H:%M:%S")
        elif isinstance(rd, str):
            t = rd
        else:
            # int, float and any other cell type share the same conversion.
            t = str(rd)
        result_data.append(t)
    return result_data


def find_key(search_key: str, row_content: str):
    """
    Report whether the keyword occurs in the text of a row.

    Args:
        search_key: Keyword to look for.
        row_content: Text of the row.

    Returns:
        True when ``search_key`` is contained in ``row_content``,
        otherwise False.
    """
    # The membership test already evaluates to a bool.
    return search_key in row_content


def process_xls(path, file):
    """
    Iterate over the non-blank rows of every sheet in an ``.xls`` workbook.

    This is a generator. A workbook that cannot be opened yields nothing.

    Args:
        path: Directory that contains the file.
        file: File name.

    Yields:
        Tuples ``(filepath, sheet_name, row_index, row_text)``.
        ``row_index`` is zero-based and ``row_text`` is the row's cell
        values converted to strings and joined with single spaces.
    """
    filepath = os.path.join(path, file)
    try:
        rb = xlrd.open_workbook(filepath, formatting_info=True)
    except Exception:
        # Best effort: an unreadable workbook is skipped. Catching Exception
        # instead of using a bare except lets KeyboardInterrupt propagate.
        return
    max_blank_rows = 10
    for ws_name in rb.sheet_names():
        ws = rb.sheet_by_name(ws_name)
        cols = ws.ncols
        # Counted per sheet and only for consecutive rows, so blank rows in
        # one sheet can no longer truncate the sheets that follow it.
        blank_rows = 0
        for r in range(ws.nrows):
            values = [ws.cell(r, c).value for c in range(cols)]
            values = change_datatype(values)
            values = " ".join(values)
            # Joining empty cells gives a run of spaces, which is truthy;
            # strip() is required to recognise the row as blank.
            if values.strip():
                blank_rows = 0
                yield filepath, ws_name, r, values  # file path, sheet name, row index, row text
                continue
            blank_rows += 1
            if blank_rows > max_blank_rows:
                # More than ten blank rows in a row: stop reading this sheet.
                break


def process_xlsx(path, file):
    """
    Iterate over the non-blank rows of every sheet in an ``.xlsx`` workbook.

    This is a generator. A workbook that cannot be opened yields nothing.
    The workbook is opened in read-only mode and closed once iteration
    finishes or the generator is discarded.

    Args:
        path: Directory that contains the file.
        file: File name.

    Yields:
        Tuples ``(filepath, sheet_name, row_index, row_text)``.
        ``row_index`` is zero-based and ``row_text`` is the row's non-empty
        cell values converted to strings and joined with single spaces.
    """
    filepath = os.path.join(path, file)
    try:
        wb = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
    except Exception:
        # Best effort: an unreadable workbook is skipped. Catching Exception
        # instead of using a bare except lets KeyboardInterrupt propagate.
        return
    max_blank_rows = 10
    try:
        for ws_name in wb.sheetnames:
            ws = wb[ws_name]
            # Counted per sheet and only for consecutive rows, so blank rows
            # in one sheet can no longer truncate the sheets that follow it.
            blank_rows = 0
            for index, row in enumerate(ws.rows):
                values = [cell.value for cell in row if cell.value is not None]
                values = change_datatype(values)
                values = " ".join(values)
                if values:
                    blank_rows = 0
                    yield filepath, ws_name, index, values  # file path, sheet name, row index, row text
                    continue
                blank_rows += 1
                if blank_rows > max_blank_rows:
                    # More than ten blank rows in a row: stop reading this sheet.
                    break
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        wb.close()


def find_excel_data(search_key: str, target_dir: str):
    """
    Walk a directory tree and yield every Excel row that contains a keyword.

    Within each directory all ``.xls`` files are scanned first, then all
    ``.xlsx`` files.

    Args:
        search_key: Keyword to search for.
        target_dir: Root directory of the search.

    Yields:
        Tuples ``(filepath, sheet_name, row_index, row_text)`` for each
        matching row.
    """
    for path, dirs, files in os.walk(target_dir):
        # Names starting with '~$' are skipped (files of workbooks that are
        # currently open).
        candidates = [name for name in files if not name.startswith('~$')]
        readers = (
            ('.xls', process_xls),
            ('.xlsx', process_xlsx),
        )
        for suffix, reader in readers:
            for name in candidates:
                if not name.endswith(suffix):
                    continue
                for filepath, ws_name, index, values in reader(path, name):
                    if find_key(search_key, values):
                        # Output: file path, sheet name, row index, row text.
                        yield filepath, ws_name, index, values


if __name__ == '__main__':
    # Demo run: search the current directory tree for a sample keyword,
    # print each hit and report the elapsed wall-clock time.
    started = time.time()
    search_key = '刘家站垦殖场'
    target_dir = './'
    for hit in find_excel_data(search_key, target_dir):
        print(list(hit))
    elapsed = round(time.time() - started, 2)
    print("\n程序运行结束,停止运行。耗时:{}秒".format(elapsed))
93 changes: 93 additions & 0 deletions contributors/bulabean/SplitExcel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import os
import xlrd, xlwt
import openpyxl
import datetime


#

def generate_xls(filepath: str, worksheet_data: dict):
    """
    Write grouped rows to a new ``.xls`` workbook, one sheet per group.

    The output is saved next to the source file; its name is the source
    name with ``_Split_<timestamp>`` inserted before the extension.

    Args:
        filepath: Path of the source workbook; used only to derive the
            output path.
        worksheet_data: Mapping of sheet name to a list of rows, each row
            being a list of cell values.

    Returns:
        The path of the workbook that was written.
    """
    datetime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    # Split off the final extension only. str.replace('.xls', ...) rewrote
    # every '.xls' in the path (directory names included) and returned the
    # source path itself when nothing matched, overwriting the input.
    root, extension = os.path.splitext(filepath)
    new_filepath = '{}_Split_{}{}'.format(root, datetime_str, extension)
    new_workbook = xlwt.Workbook(encoding='utf-8')
    for worksheet_name, row_data_list in worksheet_data.items():
        # Group keys are raw cell values and may be numbers; sheet names
        # have to be text.
        new_worksheet = new_workbook.add_sheet(str(worksheet_name))
        for row_index, row_data in enumerate(row_data_list):
            for column_index, data in enumerate(row_data):
                new_worksheet.write(row_index, column_index, data)
    new_workbook.save(new_filepath)
    return new_filepath


def process_xls(filepath, column: int, worksheet_name: str = None):
    """
    Split one sheet of an ``.xls`` workbook into groups by a column's value.

    Rows sharing the same value in the chosen column are collected together
    and handed to ``generate_xls``, which writes one sheet per group.

    Args:
        filepath: Path of the ``.xls`` workbook.
        column: One-based number of the column to group by.
        worksheet_name: Sheet to read; the first sheet when omitted.

    Returns:
        A status message: the read-error text, the column-out-of-range
        text, or the text naming the newly written file.
    """
    try:
        workbook = xlrd.open_workbook(filepath, formatting_info=True)
    except Exception:
        # Catching Exception instead of a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return "文件读取异常:{}".format(filepath)
    if worksheet_name:
        worksheet = workbook.sheet_by_name(worksheet_name)
    else:
        worksheet = workbook.sheet_by_index(0)
    rows = worksheet.nrows
    cols = worksheet.ncols
    # Same guard as the .xlsx variant: without it a too-large column raised
    # IndexError and column 0 silently grouped by the last column.
    if column < 1 or cols < column:
        return "最大列数是{},取不到第{}列".format(cols, column)
    split_data_dict = {}
    for r in range(rows):
        row_data = []
        for c in range(cols):
            value = worksheet.cell(r, c).value
            # Only genuinely empty cells become ' '; a truthiness test
            # would also blank out legitimate zeros.
            row_data.append(' ' if value is None or value == '' else value)
        split_data_dict.setdefault(row_data[column - 1], []).append(row_data)
    new_filepath = generate_xls(filepath, split_data_dict)
    return "数据保存在新文件中,文件名:{}".format(new_filepath)


def generate_xlsx(filepath: str, worksheet_data: dict):
    """
    Write grouped rows to a new ``.xlsx`` workbook, one sheet per group.

    The output is saved next to the source file; its name is the source
    name with ``_Split_<timestamp>`` inserted before the extension.

    Args:
        filepath: Path of the source workbook; used only to derive the
            output path.
        worksheet_data: Mapping of sheet name to a list of rows, each row
            being a list of cell values.

    Returns:
        The path of the workbook that was written.
    """
    datetime_str = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    # Split off the final extension only. str.replace('.xlsx', ...) rewrote
    # every '.xlsx' in the path (directory names included) and returned the
    # source path itself when nothing matched, overwriting the input.
    root, extension = os.path.splitext(filepath)
    new_filepath = '{}_Split_{}{}'.format(root, datetime_str, extension)
    new_workbook = openpyxl.Workbook()
    if worksheet_data:
        # A fresh Workbook starts with one empty default sheet. Drop it when
        # real sheets will be added, so the output holds only the groups.
        new_workbook.remove(new_workbook.active)
    for worksheet_name, row_data_list in worksheet_data.items():
        # Group keys are raw cell values and may be numbers; sheet titles
        # have to be text.
        new_worksheet = new_workbook.create_sheet(str(worksheet_name))
        for row_data in row_data_list:
            new_worksheet.append(row_data)
    new_workbook.save(new_filepath)
    return new_filepath


def process_xlsx(filepath: str, column: int, worksheet_name: str = None):
    """
    Split one sheet of an ``.xlsx`` workbook into groups by a column's value.

    Rows sharing the same value in the chosen column are collected together
    and handed to ``generate_xlsx``, which writes one sheet per group.

    Args:
        filepath: Path of the ``.xlsx`` workbook.
        column: One-based number of the column to group by.
        worksheet_name: Sheet to read; the active sheet when omitted.

    Returns:
        A status message: the read-error text, the column-out-of-range
        text, or the text naming the newly written file.
    """
    try:
        workbook = openpyxl.load_workbook(filepath, read_only=True, data_only=True)
    except Exception:
        # Catching Exception instead of a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        return "文件读取异常:{}".format(filepath)
    try:
        if worksheet_name:
            # Item access replaces the deprecated get_sheet_by_name().
            worksheet = workbook[worksheet_name]
        else:
            worksheet = workbook.active
        # column < 1 is rejected too: column 0 would otherwise silently
        # group by the last column through negative indexing.
        if column < 1 or worksheet.max_column < column:
            return "最大列数是{},取不到第{}列".format(worksheet.max_column, column)

        split_data_dict = {}
        for row in worksheet.rows:
            # Only genuinely empty cells become ' '; a truthiness test
            # would also blank out legitimate zeros.
            row_data = [
                ' ' if cell.value is None or cell.value == '' else cell.value
                for cell in row
            ]
            split_data_dict.setdefault(row_data[column - 1], []).append(row_data)
    finally:
        # Read-only workbooks keep the file open until closed explicitly.
        workbook.close()
    new_filepath = generate_xlsx(filepath, split_data_dict)
    return "数据保存在新文件中,文件名:{}".format(new_filepath)


def split_excel(filepath: str, column: int, worksheet_name: str = None):
    """
    Split an Excel workbook by column value, choosing the reader by suffix.

    Args:
        filepath: Path of the workbook; must end with ``.xlsx`` or ``.xls``.
        column: One-based number of the column to group by.
        worksheet_name: Sheet to read; the reader's default sheet when
            omitted.

    Returns:
        The status message of the reader that handled the file, or the
        unsupported-format message for any other suffix.
    """
    if filepath.endswith('.xlsx'):
        return process_xlsx(filepath, column, worksheet_name)
    if filepath.endswith('.xls'):
        return process_xls(filepath, column, worksheet_name)
    # Neither suffix matched: report it without touching the file.
    return "文件格式不对,不进行处理"


if __name__ == "__main__":
    # Demo: split the sample workbook by its sixth column and print the
    # status message. No worksheet_name is given, so the reader's default
    # sheet is used. 'SEdemo.xlsx' is the alternative sample file.
    sample_workbook = 'sedemo.xls'
    print(split_excel(sample_workbook, 6))
Binary file added contributors/bulabean/sedemo.xls
Binary file not shown.
Binary file not shown.
Binary file not shown.
2 changes: 1 addition & 1 deletion office/core/ImageType.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

class MainImage():

# 自动生成gif
#TODO:自动生成gif
def image2gif(self):
im = Image.open("1.jpg")
images = []
Expand Down
1 change: 1 addition & 0 deletions office/lib/pdf/add_watermark_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from reportlab.pdfgen import canvas
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase.pdfmetrics import registerFont
from tqdm import tqdm

def create_watermark(content):
"""创建PDF水印模板
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# -*- coding:utf-8 -*-

#############################################
# File Name: setup.py
Expand All @@ -9,6 +9,6 @@
# Description: https://mp.weixin.qq.com/s/zzD4pxNMFd0ZuWqXlVFdAg
#############################################

from setuptools import setup #这个包没有的可以pip一下
from setuptools import setup # 这个包没有的可以pip一下

setup()
6 changes: 5 additions & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
# pip install python-office -i https://pypi.python.org/simple -U
# pip install python-office -i https://pypi.python.org/simple -U

# 1、pip freeze > allpackages.txt
# 2、pip uninstall -r allpackages.txt -y
# 3、pip install --upgrade python-office
Binary file removed tests/add_img.pdf
Binary file not shown.
54 changes: 0 additions & 54 deletions tests/test_excel.py

This file was deleted.

23 changes: 0 additions & 23 deletions tests/test_file.py

This file was deleted.

File renamed without changes.
Empty file added tests/test_files/md/test.txt
Empty file.
Binary file added tests/test_files/pdf/encrypt.pdf
Binary file not shown.
Binary file added tests/test_files/pdf/images_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/test_files/pdf/in.docx
Binary file not shown.
Binary file added tests/test_files/ppt/test_pdf.pptx
Binary file not shown.
Empty file.
9 changes: 9 additions & 0 deletions tests/test_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import unittest

if __name__ == '__main__':
    # Collect every test_*.py module found under ./test_unit into one suite.
    discovered = unittest.defaultTestLoader.discover('./test_unit', pattern='test_*.py')
    # Run the suite, writing the verbose report to test_result.txt.
    with open('test_result.txt', 'w+') as report:
        unittest.TextTestRunner(stream=report, verbosity=2).run(discovered)
25 changes: 0 additions & 25 deletions tests/test_pdf.py

This file was deleted.

Empty file added tests/test_unit/__init__.py
Empty file.
Binary file added tests/test_unit/output/0816.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added tests/test_unit/test_dev.py
Empty file.
Loading

0 comments on commit 53fd04e

Please sign in to comment.