从 Office 2007 开始,微软采用基于 ZIP 压缩的 Office Open XML 格式来存储文档。例如,Word 使用的 docx 文件就是一个 ZIP 压缩包,里面保存的主要是 XML 文件。
读取docx文件示例如下:
from zipfile import ZipFile
from bs4 import BeautifulSoup
#1 使用zip解压docx,解析 document.xml,提取docx所有页的文字
def ReadDocxXmlContent(srcDocxFile):
    """Extract the plain text of a .docx file.

    The .docx is opened as a ZIP archive and ``word/document.xml`` is parsed.
    Each direct child of ``<w:body>`` contributes output as follows:

    * a table (``w:tbl``) yields one line per table row (``w:tr``), made of
      the concatenated text of every ``<w:t>`` run inside that row;
    * any other element (paragraphs, section properties, ...) yields one
      line made of the concatenated text of its ``<w:t>`` runs, which may be
      an empty line.

    Args:
        srcDocxFile: Path or binary file-like object accepted by ``ZipFile``.

    Returns:
        The extracted text; every line is terminated by ``"\\n"``. An empty
        string is returned when the document has no ``<w:body>`` element.

    Raises:
        zipfile.BadZipFile: If ``srcDocxFile`` is not a ZIP archive.
        KeyError: If the archive contains no ``word/document.xml`` member.
    """

    def local_name(node):
        # Text nodes between elements have no tag name; report them as None.
        name = getattr(node, "name", None)
        if not name:
            return None
        # NOTE(review): the tag name may be reported with its namespace prefix
        # ("w:tbl") or without it ("tbl"), presumably depending on the parser
        # or bs4 version. Comparing the part after the last ':' handles both.
        return name.rpartition(":")[2]

    def joined_text(node):
        # Concatenate all text runs below ``node`` without any separator.
        return "".join(run.get_text() for run in node.find_all("w:t"))

    # Only the raw XML is needed, so the archive is closed before parsing.
    with ZipFile(srcDocxFile) as zf:
        xmldoc = zf.read("word/document.xml").decode("utf-8")

    soup = BeautifulSoup(xmldoc, "xml")
    body = soup.find("w:body")
    if body is None:
        return ""

    lines = []
    for child in body.children:
        kind = local_name(child)
        if kind is None:
            # Skip whitespace/text nodes: they carry no <w:t> runs.
            continue
        if kind == "tbl":
            # Tables: one output line per row.
            lines.extend(
                joined_text(row)
                for row in child.children
                if local_name(row) == "tr"
            )
        else:
            # Paragraphs and every other body-level element: one line each.
            lines.append(joined_text(child))

    # Terminate every line with a real newline (the original appended 'n').
    return "".join(line + "\n" for line in lines)