一、原来使用spire.doc库,代码很简单就能实现,但是转换后有警告水印,需要开通会员才行,代码如下:
from spire.doc import *
from spire.doc.common import *
from PIL import Image
import io
# Demo script: render every page of a Word document with spire.doc and
# stitch the page images into one tall JPEG.
inputFile = "D:/Temp/测试.docx"
# Create the Document object.
doc = Document()
try:
    # Load the Word document.
    doc.LoadFromFile(inputFile)
    # Render the document; the result yields one image stream per page.
    imageStream = doc.SaveImageToStreams(ImageType.Metafile)
    # Decode each page stream with Pillow.
    images = [Image.open(io.BytesIO(image.ToArray())) for image in imageStream]
    if not images:
        raise ValueError("document produced no page images: %s" % inputFile)
    # Size the canvas for the widest page so no page is cropped; fill it with
    # white so narrower pages do not get a black margin.
    canvas_size = (
        max(img.width for img in images),
        sum(img.height for img in images),
    )
    combined_image = Image.new("RGB", canvas_size, "white")
    y_offset = 0
    for img in images:
        combined_image.paste(img, (0, y_offset))
        y_offset += img.height
    # Save the stitched pages as one long image.
    combined_image.save("D:/Temp/output/combined_image.jpg", "JPEG")
finally:
    # Release the document even when loading, rendering or saving failed.
    doc.Close()
二、现在转换思路如下:
1、先将Word文档转为pdf文件。
2、读取pdf文件,逐页转为图片,保存为多个临时文件。
3、读取临时图片文件,拼接输出一张图片文件。
4、对长图中过多的空白行进行删除。
说明:
1、word文件转pdf目前用的是win32com库(通过COM调用本机安装的Microsoft Word),只能在安装了Word的windows系统使用。
2、空白行的判断效率较低。
三、步骤:
1. word转pdf
# -*- coding:utf-8 -*-
"""
将word文档转换为pdf文件
"""
from datetime import datetime
from pathlib import Path
import win32com.client
def convert_to_pdf(input_file_path, output_file_path):
    """Convert a Word document to PDF by automating Microsoft Word over COM.

    Any existing file at ``output_file_path`` is deleted first. A failure while
    opening or saving is printed and not re-raised (best effort), so a caller
    that needs certainty should check that the output file exists afterwards.

    Args:
        input_file_path: Path of the Word document to convert.
        output_file_path: Path of the PDF file to write.
    """
    # Word runs as a separate process and does not share this process's
    # working directory, so hand it absolute paths.
    source = str(Path(input_file_path).resolve())
    target = Path(output_file_path).resolve()
    # Remove a stale output file if there is one.
    target.unlink(missing_ok=True)

    word = win32com.client.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(source)
        try:
            # 17 is Word's wdFormatPDF save format.
            doc.SaveAs2(str(target), FileFormat=17)
        finally:
            # Close the document even when saving failed.
            doc.Close()
    except Exception as e:
        print("转pdf失败:%s" % e)
    finally:
        word.Quit()
def word_2_pdf(word_name, new_pdf_name):
    """Convert the Word file ``word_name`` to the PDF ``new_pdf_name``.

    Args:
        word_name: Path of the source Word document.
        new_pdf_name: Path of the PDF file to create.

    Returns:
        ``new_pdf_name``, unchanged. The value is returned whatever the
        outcome of the conversion, so it does not prove the PDF exists.
    """
    convert_to_pdf(word_name, new_pdf_name)
    return new_pdf_name
if __name__ == "__main__":
    # Sample run: convert the test document into a PDF in the same folder.
    word_2_pdf(
        word_name="D:/Temp/测试.docx",
        new_pdf_name="D:/Temp/测试.pdf",
    )
2. pdf转图片
# -*- coding:utf-8 -*-
from datetime import datetime
from pathlib import Path
# 安装fitz 就是安装 PyMuPDF 才能使用
import fitz
# import os
# 安装 opencv, opencv的像素含义顺序是 BGR (不是常用的RGB)
# pip3 install opencv-python -i http://pypi.douban.com/simple --trusted-host pypi.douban.com
import cv2
import numpy as np
from shutil import copyfile
def pdf_2_png(pdf_name, png_name=None, zoom=2.0, page_size=(1191, 1684)):
    """Render every page of a PDF and stitch the pages into one long PNG.

    Each page is rendered at ``zoom``, resized to ``page_size`` so that all
    pages share one width, stacked vertically, passed through ``shrink_img``
    to trim long blank areas, and written as a PNG.

    Args:
        pdf_name: Path of the PDF file to convert.
        png_name: Path of the PNG to write. Defaults to the PDF's path with
            a ``.png`` suffix.
        zoom: Scale factor applied to both axes when rendering a page; 2.0
            doubles width and height, i.e. four times as many pixels.
        page_size: ``(width, height)`` in pixels that every rendered page is
            resized to before stacking.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    print(pdf_name)
    pdf_path = Path(pdf_name).parent
    render_matrix = fitz.Matrix(zoom, zoom)
    decode_flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
    target_size = tuple(page_size)

    pages = []
    doc = fitz.open(pdf_name)
    try:
        for pg in range(doc.page_count):
            pixmap = doc[pg].get_pixmap(matrix=render_matrix, alpha=False)
            # Decode the PNG bytes in memory rather than through a temporary
            # file: nothing is written next to the PDF, so no existing file
            # can be overwritten and nothing is left behind on failure.
            png_bytes = np.frombuffer(pixmap.tobytes("png"), dtype=np.uint8)
            page_img = cv2.imdecode(png_bytes, decode_flags)
            pages.append(cv2.resize(page_img, target_size))
    finally:
        # Release the PDF handle even when rendering a page failed.
        doc.close()

    if not pages:
        raise ValueError("PDF has no pages: %s" % pdf_name)

    # Stack all pages once instead of re-copying the growing image per page.
    long_img = np.vstack(pages)
    # Remove long blank areas from the stitched image.
    thin_img = shrink_img(long_img, 100, 20)

    if png_name is not None:
        output_file = png_name
    else:
        output_file = str(pdf_path.joinpath(Path(pdf_name).stem + ".png").absolute())
    # cv2.imwrite does not support paths containing Chinese characters, so
    # encode in memory and write the bytes with NumPy instead.
    cv2.imencode('.png', thin_img)[1].tofile(output_file)
def shrink_file(img_file, target_file):
    """Trim long blank areas from an image file and save the result.

    Reading and writing go through NumPy file I/O plus ``cv2.imdecode`` /
    ``cv2.imencode`` because ``cv2.imread`` / ``cv2.imwrite`` do not support
    paths containing Chinese characters.

    Args:
        img_file: Path of the image to read.
        target_file: Path of the image to write; its extension selects the
            output format.

    Raises:
        ValueError: If the input cannot be decoded or the result cannot be
            encoded in the target format.
    """
    raw = np.fromfile(img_file, dtype=np.uint8)
    pm_img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if pm_img is None:
        raise ValueError("cannot decode image: %s" % img_file)

    im = shrink_img(pm_img, 120, 20)

    ok, encoded = cv2.imencode(Path(target_file).suffix, im)
    if not ok:
        raise ValueError("cannot encode image for: %s" % target_file)
    encoded.tofile(target_file)
if __name__ == "__main__":
    # Sample run: the PNG is written next to the PDF, with the same stem.
    pdf_2_png("D:/Temp/测试.pdf")
3. 图片空白行删除
def is_blank(line):
    """Return True when every value in an image row equals 255 (pure white).

    Args:
        line: One image row. Either a sequence of pixels, each a sequence of
            channel values, or a flat sequence of gray values.

    Returns:
        True if all values are 255 (an empty row counts as blank),
        otherwise False.
    """
    # A single vectorized comparison replaces the per-pixel Python loop and
    # also accepts single-channel (grayscale) rows.
    return bool(np.all(np.asarray(line) == 255))
def get_blank_block(img, begin_row, end_row, need_height):
    """Find the first run of blank rows that is at least ``need_height`` tall.

    Rows ``begin_row`` (inclusive) to ``end_row`` (exclusive) are scanned top
    to bottom. Once a run reaches ``need_height`` the scan keeps going until
    the run ends, so the reported height covers the whole run.

    Args:
        img: Image indexed as ``img[row, :]``.
        begin_row: First row to examine.
        end_row: Row at which to stop (exclusive).
        need_height: Minimum number of consecutive blank rows required.

    Returns:
        A tuple ``(found, start_row, height)``. ``start_row`` and ``height``
        describe the run only when ``found`` is True. ``(False, 0, 0)`` is
        returned for invalid arguments or a range shorter than
        ``need_height``.
    """
    bad_args = (img is None) or (begin_row < 0) or (end_row < begin_row) or (need_height <= 0)
    if bad_args or (end_row - begin_row) < need_height:
        return False, 0, 0

    run_start = -1
    run_height = 0
    for row in range(begin_row, end_row):
        if is_blank(img[row, :]):
            # Blank row: open a new run if none is in progress, then extend it.
            if run_start < 0:
                run_start = row
            run_height += 1
        elif run_height >= need_height:
            # A tall-enough run has just ended; stop scanning.
            break
        else:
            # The run was too short; forget it and keep looking.
            run_start = -1
            run_height = 0
    return run_height >= need_height, run_start, run_height
def shrink_img(img, blank_height=50, reserve_height=20):
    """Collapse long runs of pure-white rows in an image.

    A row is blank when every value in it is 255. Each run of at least
    ``blank_height`` consecutive blank rows is cut down to its first
    ``reserve_height`` rows; shorter runs and all other rows are kept.

    Args:
        img: Image array whose first axis is the row index.
        blank_height: Minimum run length that gets shortened. A value <= 0
            disables shrinking.
        reserve_height: Number of blank rows kept from each shortened run.
            Capped at the run length so no non-blank row is ever duplicated;
            negative values are treated as 0.

    Returns:
        A new array holding the kept rows in their original order.
    """
    height = img.shape[0]
    if blank_height <= 0 or img.size == 0:
        return img.copy()

    # Classify every row once. All values equal 255 exactly when both the
    # row minimum and the row maximum are 255, which avoids building a
    # boolean array as large as the image.
    rows = img.reshape(height, -1)
    row_is_blank = (rows.min(axis=1) == 255) & (rows.max(axis=1) == 255)

    # Positions where blankness flips mark run boundaries: even entries are
    # run starts, odd entries are the (exclusive) run ends.
    padded = np.concatenate(([False], row_is_blank, [False]))
    boundaries = np.flatnonzero(padded[1:] != padded[:-1])
    run_starts = boundaries[0::2].tolist()
    run_ends = boundaries[1::2].tolist()

    reserve = max(reserve_height, 0)
    pieces = []
    cursor = 0
    for start, end in zip(run_starts, run_ends):
        run_length = end - start
        if run_length < blank_height:
            continue
        # Keep everything up to the run plus the reserved blank rows, then
        # resume after the run.
        pieces.append(img[cursor:start + min(reserve, run_length)])
        cursor = end
    pieces.append(img[cursor:])

    # Concatenate once instead of re-copying the growing result per run.
    return np.vstack(pieces)
if __name__ == "__main__":
    # Manual check: shrink a sample image and show it until a key is pressed.
    source_img = cv2.imread("d:/test.png")
    cv2.imshow('result', shrink_img(source_img, blank_height=60, reserve_height=20))
    cv2.waitKey(0)
    cv2.destroyAllWindows()
备注:需要安装如下
pip3 install pypiwin32 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
::pip3 install fitz -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip3 install PyMuPDF -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip3 install --upgrade opencv-python -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
fitz模块实际上由PyMuPDF提供,安装PyMuPDF后即可import fitz;直接安装名为fitz的包非常麻烦,将会出现下面的问题。
安装第二个出现Microsoft Visual C++ 14.0 or greater is required的解决方法:链接地址