python使用python-docx读取word内容

电脑版发表于:2025/5/27 9:42

[TOC]

### 先安装依赖

```bash
pip install python-docx
```

### 段落内容和表格内容分开读取

```python
import docx
from docx.document import Document
from docx.table import Table
from docx.text.paragraph import Paragraph


def read_word_document(file_path: str) -> None:
    """Read a Word document and print its paragraphs, then its tables.

    Errors are reported on stdout instead of being raised.

    Args:
        file_path: Path of the .docx file to open.
    """
    try:
        # Open the document at the path supplied by the caller.
        doc: Document = docx.Document(file_path)

        # Print every non-empty paragraph; numbering counts empty ones too.
        print("文档段落内容:")
        for i, paragraph in enumerate(doc.paragraphs, 1):
            if paragraph.text.strip():  # skip empty paragraphs
                print(f"段落 {i}: {paragraph.text}")

        # Print every table, one row per line.
        print("\n文档表格内容:")
        for table_idx, table in enumerate(doc.tables, 1):
            print(f"\n表格 {table_idx}:")
            for row_idx, row in enumerate(table.rows):
                row_data = [cell.text for cell in row.cells]
                print(f"行 {row_idx + 1}: {row_data}")

    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        print(f"错误:读取文件时发生异常 - {e}")


if __name__ == "__main__":
    # Replace with the actual path of the Word document.
    file_path = "example.docx"
    read_word_document(file_path)
```

### 一次性读取完

```python
import docx
from docx.document import Document
from docx.oxml.ns import qn


def read_word_document(file_path: str) -> None:
    """Read a Word document and print all of its content in document order.

    NOTE: this version relies on `import docx` exposing the `docx.text` and
    `docx.table` submodules as attributes. When it does not, it fails with
    the error described in the section "读取word报错" below.

    Args:
        file_path: Path of the .docx file to open.
    """
    try:
        doc: Document = docx.Document(file_path)
        content = []

        # Walk every block-level child of the body (paragraphs and tables).
        for element in doc.element.body:
            # Dispatch on the XML tag of the element.
            if element.tag.endswith('p'):  # paragraph
                paragraph = docx.text.paragraph.Paragraph(element, doc)
                if paragraph.text.strip():
                    content.append(f"[段落] {paragraph.text}")

            elif element.tag.endswith('tbl'):  # table
                table = docx.table.Table(element, doc)
                table_content = []
                for row in table.rows:
                    row_data = [cell.text for cell in row.cells]
                    table_content.append(" | ".join(row_data))
                content.append(f"[表格]\n" + "\n".join(table_content))

        # Print everything that was collected.
        print("\n".join(content))

    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        print(f"错误:读取文件时发生异常 - {e}")


if __name__ == "__main__":
    # Relative path of the file to read.
    file_path = "words/222.docx"
    read_word_document(file_path)
```

### 读取网络中的word内容

```python
import docx
import requests
from io import BytesIO


def read_word_document_from_url(url: str) -> None:
    """Download a Word document from a URL and print its content in order.

    NOTE: this version relies on `import docx` exposing the `docx.text` and
    `docx.table` submodules as attributes. When it does not, it fails with
    the error described in the section "读取word报错" below.

    Args:
        url: HTTP(S) address of the .docx file.
    """
    try:
        # Fetch the document over HTTP.
        response = requests.get(url)
        response.raise_for_status()  # fail on a non-2xx response

        # Wrap the downloaded bytes in a file-like object.
        doc_bytes = BytesIO(response.content)

        # Open the document from memory.
        doc = docx.Document(doc_bytes)
        content = []

        # Walk every block-level child of the body (paragraphs and tables).
        for element in doc.element.body:
            if element.tag.endswith('p'):  # paragraph
                paragraph = docx.text.paragraph.Paragraph(element, doc)
                if paragraph.text.strip():
                    content.append(f"[段落] {paragraph.text}")

            elif element.tag.endswith('tbl'):  # table
                table = docx.table.Table(element, doc)
                table_content = []
                for row in table.rows:
                    row_data = [cell.text for cell in row.cells]
                    table_content.append(" | ".join(row_data))
                content.append(f"[表格]\n" + "\n".join(table_content))

        # Print everything that was collected.
        print("\n".join(content))

    except requests.exceptions.RequestException as e:
        print(f"HTTP 请求错误: {e}")
    except Exception as e:
        print(f"错误: 处理文档时发生异常 - {e}")


if __name__ == "__main__":
    # URL of the Word document to read.
    url = "http://watertapcollection.cqzuxia.com/ImportTemplate/222.docx"
    read_word_document_from_url(url)
```

### 读取word报错: 错误:读取文件时发生异常 - module 'docx' has no attribute 'table'

#### 直接读取文件的版本

```python
import docx
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph
import argparse


def read_word_document(file_path: str) -> None:
    """Read a Word document and print all of its content in document order.

    The needed classes are imported explicitly, so nothing depends on
    `docx.table` / `docx.text` being reachable as attributes of `docx`.

    Args:
        file_path: Path of the .docx file to open.
    """
    try:
        # Open the local document.
        doc = docx.Document(file_path)
        content = []

        # Walk every block-level child of the body (paragraphs and tables).
        for element in doc.element.body:
            if isinstance(element, CT_P):  # paragraph
                paragraph = Paragraph(element, doc)
                if paragraph.text.strip():
                    content.append(f"[段落] {paragraph.text}")

            elif isinstance(element, CT_Tbl):  # table
                table_content = []
                for row in element.tr_lst:
                    row_data = []
                    for cell in row.tc_lst:
                        # Concatenate the text of every paragraph in the cell.
                        cell_text = "".join(
                            Paragraph(p, doc).text for p in cell.p_lst
                        )
                        row_data.append(cell_text)
                    table_content.append(" | ".join(row_data))
                content.append(f"[表格]\n" + "\n".join(table_content))

        # Print everything that was collected.
        print("\n".join(content))

    except FileNotFoundError:
        print(f"错误:找不到文件 '{file_path}'")
    except Exception as e:
        print(f"错误:处理文档时发生异常 - {e}")


if __name__ == "__main__":
    # Parse command-line arguments.
    parser = argparse.ArgumentParser(description='读取Word文档内容')
    parser.add_argument('--file', default='words/output.docx',
                        help='Word文档路径(默认:words/output.docx)')
    args = parser.parse_args()

    # Read the document at the requested path.
    read_word_document(args.file)
```

#### 读取网络中的地址版本

```python
import docx
import requests
from io import BytesIO
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.text.paragraph import Paragraph


def read_word_document_from_url(url: str, timeout: float = 30.0) -> None:
    """Download a Word document from a URL and print its content in order.

    Errors are reported on stdout instead of being raised.

    Args:
        url: HTTP(S) address of the .docx file.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block forever.
    """
    try:
        # Fetch the document over HTTP.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # fail on a non-2xx response

        # Wrap the downloaded bytes in a file-like object.
        doc_bytes = BytesIO(response.content)

        # Open the document from memory.
        doc = docx.Document(doc_bytes)
        content = []

        # Walk every block-level child of the body (paragraphs and tables).
        for element in doc.element.body:
            if isinstance(element, CT_P):  # paragraph
                paragraph = Paragraph(element, doc)
                if paragraph.text.strip():
                    content.append(f"[段落] {paragraph.text}")

            elif isinstance(element, CT_Tbl):  # table
                # `element` already is the table's XML element; use it
                # directly instead of constructing a new CT_Tbl around it,
                # which would re-parent it out of the document body.
                table_content = []
                for row in element.tr_lst:
                    row_data = []
                    for cell in row.tc_lst:
                        # Concatenate the text of every paragraph in the
                        # cell, not just the first one.
                        cell_text = "".join(
                            Paragraph(p, doc).text for p in cell.p_lst
                        )
                        row_data.append(cell_text)
                    table_content.append(" | ".join(row_data))
                content.append(f"[表格]\n" + "\n".join(table_content))

        # Print everything that was collected.
        print("\n".join(content))

    except requests.exceptions.RequestException as e:
        print(f"HTTP 请求错误: {e}")
    except Exception as e:
        print(f"错误: 处理文档时发生异常 - {e}")


if __name__ == "__main__":
    # URL of the Word document to read.
    # NOTE(review): the second assignment overrides the first. python-docx
    # reads .docx (Office Open XML) files only, so the legacy .doc URL is
    # expected to fail to open — confirm which URL is intended.
    url = "http://watertapcollection.xj.com/ImportTemplate/222.docx"
    url = "http://watertapcollection.xj.com/ImportTemplate/111.doc"
    read_word_document_from_url(url)
```