本文基于python-docx==1.1.0,主要记录docx源码中docx.opc.phys_pkg、docx.opc.pkgreader、docx.opc.pkgwriter三个模块:前者主要描述OPC中的ZIP实现,后两者介绍了如何从物理文件中读取或者写入信息。ZIP读取的逻辑定义在docx.opc.phys_pkg模块中:
class _ZipPkgReader(PhysPkgReader):
    """Implements |PhysPkgReader| interface for a zip file OPC package."""

    def __init__(self, pkg_file):
        """Open `pkg_file` as a zip archive in read mode."""
        super(_ZipPkgReader, self).__init__()
        self._zipf = ZipFile(pkg_file, "r")

    def blob_for(self, pack_uri):
        """Return blob corresponding to `pack_uri`.

        Raises |KeyError| if no matching member is present in zip archive. This
        is the exception `ZipFile.read()` raises for a missing member and the
        one `rels_xml_for()` catches.
        """
        return self._zipf.read(pack_uri.membername)

    def close(self):
        """Close the zip archive, releasing any resources it is using."""
        self._zipf.close()

    def rels_xml_for(self, source_uri):
        """Return rels item XML for source with `source_uri` or None if no rels item is
        present."""
        try:
            return self.blob_for(source_uri.rels_uri)
        except KeyError:
            # a missing rels member simply means the source has no relationships
            return None
其中,rels_xml_for的形参source_uri为PackURI实例:如果实参为“/”(即PACKAGE_URI),则source_uri.rels_uri="_rels/.rels";如果实参为“/word/document.xml”,则source_uri.rels_uri="word/_rels/document.xml.rels"。
ZIP写入的逻辑同样定义在docx.opc.phys_pkg模块中,源码如下:
class _ZipPkgWriter(PhysPkgWriter):
    """Zip-archive implementation of the |PhysPkgWriter| interface."""

    def __init__(self, pkg_file):
        """Open `pkg_file` as a deflate-compressed zip archive in write mode."""
        super().__init__()
        self._zipf = ZipFile(pkg_file, mode="w", compression=ZIP_DEFLATED)

    def close(self):
        """Close the underlying zip archive."""
        self._zipf.close()

    def write(self, pack_uri, blob):
        """Store `blob` in the archive under the member name of `pack_uri`."""
        member_name = pack_uri.membername
        self._zipf.writestr(member_name, blob)
读取docx物理文件涉及序列化part对象、序列化关系集合对象。
包内文件“[Content_Types].xml”定义了包内子文件的默认、自定义类型。由于实例化part对象必须指明part节点的类型,因此必须首先解析“[Content_Types].xml”文件。docx.opc.pkgreader模块中定义的content_types处理逻辑如下:
class _ContentTypeMap:
    """Value type offering dictionary-style lookup of a content type by part
    name, e.g. ``content_type = cti['/ppt/presentation.xml']``."""

    def __init__(self):
        """Start with empty, case-insensitive default and override tables."""
        super().__init__()
        self._defaults = CaseInsensitiveDict()
        self._overrides = CaseInsensitiveDict()

    @staticmethod
    def from_xml(content_types_xml):
        """Build a |_ContentTypeMap| from the XML in `content_types_xml`.

        Every override entry of the parsed document is registered first,
        followed by every default entry.
        """
        root = parse_xml(content_types_xml)
        mapping = _ContentTypeMap()
        for override in root.overrides:
            mapping._add_override(override.partname, override.content_type)
        for default in root.defaults:
            mapping._add_default(default.extension, default.content_type)
        return mapping
获取了part的content_type与partname,序列化的part对象定义如下:
class _SerializedPart:
"""Value object for an OPC package part.
Provides access to the partname, content type, blob, and serialized relationships
for the part.
"""
def __init__(self, partname, content_type, reltype, blob, srels):
super(_SerializedPart, self).__init__()
self._partname = partname
self._content_type = content_type
self._reltype = reltype
self._blob = blob
self._srels = srels
单条的序列化关系对象逻辑定义如下:
class _SerializedRelationship:
"""Value object representing a serialized relationship in an OPC package.
Serialized, in this case, means any target part is referred to via its partname
rather than a direct link to an in-memory |Part| object.
"""
def __init__(self, baseURI, rel_elm):
super(_SerializedRelationship, self).__init__()
self._baseURI = baseURI
self._rId = rel_elm.rId
self._reltype = rel_elm.reltype
self._target_mode = rel_elm.target_mode
self._target_ref = rel_elm.target_ref
@property
def target_partname(self):
"""|PackURI| instance containing partname targeted by this relationship.
Raises ``ValueError`` on reference if target_mode is ``'External'``. Use
:attr:`target_mode` to check before referencing.
"""
if self.is_external:
msg = (
"target_partname attribute on Relationship is undefined w"
'here TargetMode == "External"'
)
raise ValueError(msg)
# lazy-load _target_partname attribute
if not hasattr(self, "_target_partname"):
self._target_partname = PackURI.from_rel_ref(self._baseURI, self.target_ref)
return self._target_partname
序列化关系集合定义如下:
class _SerializedRelationships:
"""Read-only sequence of |_SerializedRelationship| instances corresponding to the
relationships item XML passed to constructor."""
def __init__(self):
super(_SerializedRelationships, self).__init__()
self._srels = []
@staticmethod
def load_from_xml(baseURI, rels_item_xml):
"""Return |_SerializedRelationships| instance loaded with the relationships
contained in `rels_item_xml`.
Returns an empty collection if `rels_item_xml` is |None|.
"""
srels = _SerializedRelationships()
if rels_item_xml is not None:
rels_elm = parse_xml(rels_item_xml)
for rel_elm in rels_elm.Relationship_lst:
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
到目前为止,我们还只获取到一个个零散的序列化part对象或者序列化关系集合,如何整合这些对象形成一个OPC标准的物理包——本质是一个图数据结构?从docx物理文件创建物理包对象的逻辑定义于PackageReader类:
class PackageReader:
    """Provides access to the contents of a zip-format OPC package via its
    :attr:`serialized_parts` and :attr:`pkg_srels` attributes."""

    def __init__(self, content_types, pkg_srels, sparts):
        """Store the package-level relationships and the serialized parts.

        `content_types` is accepted as part of the signature but is not retained
        on the instance.
        """
        super(PackageReader, self).__init__()
        self._pkg_srels = pkg_srels
        self._sparts = sparts

    @staticmethod
    def from_file(pkg_file):
        """Return a |PackageReader| instance loaded with contents of `pkg_file`.

        The physical reader is closed before returning, including when reading
        or parsing raises, so the underlying file is not left open on failure.
        """
        phys_reader = PhysPkgReader(pkg_file)
        try:
            content_types = _ContentTypeMap.from_xml(phys_reader.content_types_xml)
            pkg_srels = PackageReader._srels_for(phys_reader, PACKAGE_URI)
            sparts = PackageReader._load_serialized_parts(
                phys_reader, pkg_srels, content_types
            )
        finally:
            phys_reader.close()
        return PackageReader(content_types, pkg_srels, sparts)
......
实例属性_pkg_srels表示package_level级别的关系集合,实例属性_sparts表示序列化的part对象集合。一般不直接创建PackageReader实例,而是通过静态方法from_file创建实例。
from_file方法中,第一句本质是实例化_ZipPkgReader。
第二句是解析[Content_Types].xml。
静态方法_srels_for定义如下:
@staticmethod
def _srels_for(phys_reader, source_uri):
    """Load the relationships of the source identified by `source_uri`.

    Fetches the rels item XML for `source_uri` from `phys_reader` and returns
    the |_SerializedRelationships| built from it, using the base URI of
    `source_uri`.
    """
    rels_item_xml = phys_reader.rels_xml_for(source_uri)
    load = _SerializedRelationships.load_from_xml
    return load(source_uri.baseURI, rels_item_xml)
静态方法_load_serialized_parts的定义如下:
@staticmethod
def _load_serialized_parts(phys_reader, pkg_srels, content_types):
    """Return a tuple of |_SerializedPart| instances, one for each part that
    `_walk_phys_parts` yields when walking `phys_reader` from `pkg_srels`.

    The content type of each part is looked up in `content_types` by partname.
    """
    walker = PackageReader._walk_phys_parts(phys_reader, pkg_srels)
    return tuple(
        _SerializedPart(partname, content_types[partname], reltype, blob, srels)
        for partname, blob, reltype, srels in walker
    )
该方法从package_level级别的关系出发,根据深度优先的策略,返回由序列化part对象组成的元组(tuple)。
深度优先的实现定义在静态方法_walk_phys_parts:
@staticmethod
def _walk_phys_parts(phys_reader, srels, visited_partnames=None):
    """Yield a 4-tuple `(partname, blob, reltype, srels)` for each part reached
    by walking the relationship graph rooted at `srels`.

    External relationships are skipped. `visited_partnames` records partnames
    already yielded so each part is produced once; the same list is passed on
    to every recursive call.
    """
    seen = [] if visited_partnames is None else visited_partnames
    internal_srels = (srel for srel in srels if not srel.is_external)
    for srel in internal_srels:
        partname = srel.target_partname
        if partname in seen:
            continue
        seen.append(partname)
        reltype = srel.reltype
        part_srels = PackageReader._srels_for(phys_reader, partname)
        blob = phys_reader.blob_for(partname)
        yield (partname, blob, reltype, part_srels)
        # depth-first: descend into this part's own relationships before
        # moving on to the next sibling relationship
        yield from PackageReader._walk_phys_parts(phys_reader, part_srels, seen)
其中最核心的部分在:1)part_srels = PackageReader._srels_for(phys_reader, partname),用于加载part_level级别的关系集合;2)next_walker = PackageReader._walk_phys_parts(phys_reader, part_srels, visited_partnames),如果part_level级别的关系集合不为空,则将该part作为一个根节点,创建与其关联的其它序列化part对象。
将OPC物理包封装的序列化part与关系持久化,相对读取过程较为容易。仍然需要先处理“[Content_Types].xml”文件。处理“[Content_Types].xml”文件的逻辑定义在_ContentTypesItem:
class _ContentTypesItem:
    """Service class that assembles the content types item
    ([Content_Types].xml) from a list of parts.

    Instances are obtained through :meth:`from_parts`, e.g.
    ``_ContentTypesItem.from_parts(parts)``.
    """

    def __init__(self):
        """Start with empty default and override tables."""
        self._defaults = CaseInsensitiveDict()
        self._overrides = {}

    def _add_content_type(self, partname, content_type):
        """Record `content_type` for the part named `partname`.

        The entry is stored as a default, keyed by extension, when the
        lower-cased extension and content type pair appears in
        `default_content_types`; otherwise it is stored as an override keyed
        by `partname`.
        """
        ext = partname.ext
        if (ext.lower(), content_type) not in default_content_types:
            self._overrides[partname] = content_type
            return
        self._defaults[ext] = content_type

    @classmethod
    def from_parts(cls, parts):
        """Return a new instance holding the content type of each part in `parts`.

        Defaults for the ``rels`` and ``xml`` extensions are always registered
        before the parts are processed.
        """
        item = cls()
        item._defaults["rels"] = CT.OPC_RELATIONSHIPS
        item._defaults["xml"] = CT.XML
        for part in parts:
            item._add_content_type(part.partname, part.content_type)
        return item
处理完content_types,持久化序列化的关系与part节点的逻辑定义于PackageWriter:
class PackageWriter:
    """Writes a zip-format OPC package to `pkg_file`, where `pkg_file` can be either a
    path to a zip file (a string) or a file-like object.

    Its single API method, :meth:`write`, is static, so this class is not intended to be
    instantiated.
    """

    @staticmethod
    def write(pkg_file, pkg_rels, parts):
        """Write a physical package (e.g. a .docx file) to `pkg_file`.

        The package receives, in this order, a content types stream derived
        from `parts`, the package-level relationships `pkg_rels`, and each of
        `parts`. The physical writer is closed even when one of these steps
        raises, so the underlying file is not left open on failure.
        """
        phys_writer = PhysPkgWriter(pkg_file)
        try:
            PackageWriter._write_content_types_stream(phys_writer, parts)
            PackageWriter._write_pkg_rels(phys_writer, pkg_rels)
            PackageWriter._write_parts(phys_writer, parts)
        finally:
            phys_writer.close()

    @staticmethod
    def _write_content_types_stream(phys_writer, parts):
        """Write ``[Content_Types].xml`` part to the physical package with an
        appropriate content type lookup target for each part in `parts`."""
        cti = _ContentTypesItem.from_parts(parts)
        phys_writer.write(CONTENT_TYPES_URI, cti.blob)

    @staticmethod
    def _write_parts(phys_writer, parts):
        """Write the blob of each part in `parts` to the package, along with a rels item
        for its relationships if and only if it has any."""
        for part in parts:
            phys_writer.write(part.partname, part.blob)
            # a part without relationships gets no rels item at all
            if len(part._rels):
                phys_writer.write(part.partname.rels_uri, part._rels.xml)

    @staticmethod
    def _write_pkg_rels(phys_writer, pkg_rels):
        """Write the XML rels item for `pkg_rels` ('/_rels/.rels') to the package."""
        phys_writer.write(PACKAGE_URI.rels_uri, pkg_rels.xml)
在执行docx.api.Document(filepath)时,如果抛出类似 KeyError: "There is no item named 'word/NULL' in the archive" 的异常,通常是因为'word/_rels/document.xml.rels'文件中某一CT_Relationship的Target属性值为"NULL"。此时可以重定义_SerializedRelationships的静态方法load_from_xml,跳过这类关系,解决方案如下:
from docx.opc.oxml import parse_xml
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
def load_from_xml(baseURI, rels_item_xml):
    """Build a |_SerializedRelationships| collection from `rels_item_xml`.

    Relationship elements whose ``Target`` attribute is exactly ``"NULL"`` are
    left out. An empty collection is returned when `rels_item_xml` is |None|.
    """
    collection = _SerializedRelationships()
    if rels_item_xml is None:
        return collection
    root = parse_xml(rels_item_xml)
    collection._srels.extend(
        _SerializedRelationship(baseURI, rel_elm)
        for rel_elm in root.Relationship_lst
        # skip relationships that point at the placeholder target "NULL"
        if rel_elm.get("Target") != "NULL"
    )
    return collection
# Override the library's definition. Wrapping the function in staticmethod keeps
# it unbound, so it takes (baseURI, rels_item_xml) whether it is looked up on the
# class or on an instance.
_SerializedRelationships.load_from_xml = staticmethod(load_from_xml)
本文结合OPC标准与docx三方库源码,对如何从docx格式文件中抽取物理包对象、以及将物理包对象写入docx格式文件的过程进行了记录。首先介绍了OPC遵循ZIP标准,其次对读取物理包进行分解——创建序列化关系及序列化parts集合,然后对写入物理包流程进行介绍,最后对一种docx抛出的常见异常给出了优化方法。通过本文可以加深对OPC物理包的认知、及为后续的OPC抽象包奠定基础。