forked from ietf-tools/datatracker
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxmldraft.py
248 lines (215 loc) · 9.17 KB
/
xmldraft.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# Copyright The IETF Trust 2022, All Rights Reserved
# -*- coding: utf-8 -*-
import datetime
import io
import re
import xml2rfc
import debug # pyflakes: ignore
from contextlib import ExitStack
from lxml.etree import XMLSyntaxError
from xml2rfc.util.date import augment_date, extract_date
from ietf.utils.timezone import date_today
from .draft import Draft
class XMLDraft(Draft):
    """Draft from XML source

    Parses an xml2rfc source file and exposes metadata (name, revision, title,
    creation date, authors, references) read from the parsed tree.

    Not all methods from the superclass are implemented yet.
    """

    def __init__(self, xml_file):
        """Initialize XMLDraft instance

        :parameter xml_file: path to file containing XML source
        :raises InvalidXMLError: if the parser reports an XML syntax error
        :raises XMLParseError: if the parser fails for any other reason
        :raises ValueError: if the root element's docName is missing or cannot be parsed
        """
        super().__init__()
        # cast xml_file to str so, e.g., this will work with a Path
        self.xmltree, self.xml_version = self.parse_xml(str(xml_file))
        self.xmlroot = self.xmltree.getroot()
        self.filename, self.revision = self.parse_docname(self.xmlroot)

    @staticmethod
    def parse_xml(filename):
        """Parse XML draft

        Converts to xml2rfc v3 schema if the source is v2, then returns the parsed
        tree (call ``getroot()`` on it to get the root element) and the original
        xml version.

        While parsing, xml2rfc's log writers are redirected to in-memory buffers so
        their contents can be attached to an XMLParseError. The original writers are
        restored when parsing finishes, whether or not it succeeded.

        :parameter filename: path to the XML source
        :returns: tuple ``(tree, xml_version)``; ``xml_version`` is the root element's
            ``version`` attribute, or ``'2'`` if the attribute is absent
        :raises InvalidXMLError: if lxml raises XMLSyntaxError
        :raises XMLParseError: if the parser raises any other exception
        """
        orig_write_out = xml2rfc.log.write_out
        orig_write_err = xml2rfc.log.write_err
        parser_out = io.StringIO()
        parser_err = io.StringIO()

        with ExitStack() as stack:
            @stack.callback
            def cleanup():  # called when context exited, even if there's an exception
                xml2rfc.log.write_out = orig_write_out
                xml2rfc.log.write_err = orig_write_err

            xml2rfc.log.write_out = parser_out
            xml2rfc.log.write_err = parser_err

            parser = xml2rfc.XmlRfcParser(filename, quiet=True)
            try:
                tree = parser.parse()
            except XMLSyntaxError as e:
                # chain explicitly so the underlying syntax error stays attached
                raise InvalidXMLError() from e
            except Exception as e:
                raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e

            xml_version = tree.getroot().get('version', '2')
            if xml_version == '2':
                v2v3 = xml2rfc.V2v3XmlWriter(tree)
                tree.tree = v2v3.convert2to3()
        return tree, xml_version

    def _document_name(self, ref):
        """Get document name from reference.

        Tries, in order: the href of an xi:include, the anchor attribute, and the
        seriesInfo children of the element.

        :parameter ref: a reference, referencegroup, or xi:include element
        :returns: a name such as ``rfc2119`` or ``draft-foo-bar``, or ``""`` if no
            name could be extracted
        """
        series = ["rfc", "bcp", "fyi", "std"]
        # handle xinclude first
        # FIXME: this assumes the xinclude is a bibxml href; if it isn't, there can
        # still be false negatives. it would be better to expand the xinclude and parse
        # its seriesInfo.
        if ref.tag.endswith("}include"):
            # an include without an href cannot be matched; treat it as an empty href
            href = ref.get("href", "")
            name = re.search(
                rf"reference\.({'|'.join(series).upper()})\.(\d{{4}})\.xml",
                href,
            )
            if name:
                # int() drops the zero padding, e.g. "RFC.0791" -> "rfc791"
                return f"{name.group(1)}{int(name.group(2))}".lower()
            name = re.search(r"reference\.I-D\.(?:draft-)?(.*)\.xml", href)
            if name:
                return f"draft-{name.group(1)}"
            # can't extract the name, give up
            return ""

        # check the anchor next; a missing anchor simply fails to match
        anchor = (ref.get("anchor") or "").lower()  # always give back lowercase
        label = anchor.rstrip("0123456789")  # remove trailing digits
        maybe_number = anchor[len(label):]
        if label in series and maybe_number.isdigit():
            number = int(maybe_number)
            return f"{label}{number}"

        # if we couldn't find a match so far, try the seriesInfo
        series_query = " or ".join(f"@name='{x.upper()}'" for x in series)
        for info in ref.xpath(
            f"./seriesInfo[{series_query} or @name='Internet-Draft']"
        ):
            value = info.get("value")
            if not value:
                # skip seriesInfo with a missing or empty value
                continue
            series_name = info.get("name")  # always present: the xpath selects on it
            if series_name == "Internet-Draft":
                return value
            return f"{series_name.lower()}{value}"
        return ""

    def _reference_section_type(self, section_name):
        """Determine reference type from name of references section

        :parameter section_name: name of the section; may be None or empty
        :returns: REF_TYPE_NORMATIVE if the name contains "normative",
            REF_TYPE_INFORMATIVE if it contains "informative" (both checked
            case-insensitively, in that order), otherwise REF_TYPE_UNKNOWN
        """
        if section_name:
            section_name = section_name.lower()
            if 'normative' in section_name:
                return self.REF_TYPE_NORMATIVE
            elif 'informative' in section_name:
                return self.REF_TYPE_INFORMATIVE
        return self.REF_TYPE_UNKNOWN

    def _reference_section_name(self, section_elt):
        """Get the name of a references section

        Uses the text of the ``name`` child element, falling back to the ``title``
        attribute. Returns None if neither is present.
        """
        section_name = section_elt.findtext('name')
        if section_name is None:
            # fall back to title if we have it; get() gives None when it is absent
            section_name = section_elt.get('title')
        return section_name

    @staticmethod
    def parse_docname(xmlroot):
        """Split the root element's docName into filename and revision

        :parameter xmlroot: root element of the draft
        :returns: tuple ``(filename, revision)``; ``revision`` is the trailing
            two-digit suffix of docName, or None if docName has no such suffix
        :raises ValueError: if docName is missing or cannot be parsed
        """
        docname = xmlroot.attrib.get('docName')
        if docname is None:
            raise ValueError("Missing docName attribute in the XML root element")
        revmatch = re.match(
            r'^(?P<filename>.+?)(?:-(?P<rev>[0-9][0-9]))?$',
            docname,
        )
        if revmatch is None:
            raise ValueError('Unable to parse docName')
        # If a group had no match it is None
        return revmatch.group('filename'), revmatch.group('rev')

    def get_title(self):
        """Return the text of front/title with surrounding whitespace removed

        Returns an empty string if the draft has no front/title element.
        """
        return self.xmlroot.findtext('front/title', default='').strip()

    @staticmethod
    def parse_creation_date(date_elt):
        """Convert a date element into a datetime.date

        :parameter date_elt: the date element, or None
        :returns: a datetime.date, or None if ``date_elt`` is None
        """
        if date_elt is None:
            return None
        today = date_today()
        # this mimics handling of date elements in the xml2rfc text/html writers
        year, month, day = extract_date(date_elt, today)
        year, month, day = augment_date(year, month, day, today)
        if not day:
            # Must choose a day for a datetime.date. Per RFC 7991 sect 2.17, we use
            # today's date if it is consistent with the rest of the date. Otherwise,
            # arbitrarily (and consistent with the text parser) assume the 15th.
            if year == today.year and month == today.month:
                day = today.day
            else:
                day = 15
        # NOTE(review): if augment_date can leave year or month unset for an
        # incomplete date, datetime.date() will raise here - TODO confirm
        return datetime.date(year, month, day)

    def get_creation_date(self):
        """Return the date from front/date, or None if there is no such element"""
        return self.parse_creation_date(self.xmlroot.find("front/date"))

    # todo fix the implementation of XMLDraft.get_abstract()
    #
    # This code was pulled from ietf.submit.forms where it existed for some time.
    # It does not work, at least with modern xml2rfc. This assumes that the abstract
    # is simply text in the front/abstract node, but the XML schema wraps the actual
    # abstract text in <t> elements (and allows <dl>, <ol>, and <ul> as well). As a
    # result, this method normally returns an empty string, which is later replaced by
    # the abstract parsed from the rendered text. For now, I am commenting this out
    # and making it explicit that the abstract always comes from the text format.
    #
    # def get_abstract(self):
    #     """Extract the abstract"""
    #     abstract = self.xmlroot.findtext('front/abstract')
    #     return abstract.strip() if abstract else ''

    def get_author_list(self):
        """Get detailed author list

        Returns a list of dicts with the following keys:
            name, first_name, middle_initial, last_name,
            name_suffix, email, country, affiliation
        Values will be None if not available. Only name, email, affiliation and
        country are read from the XML; the other keys are always None.
        """
        result = []
        empty_author = {
            k: None for k in [
                'name', 'first_name', 'middle_initial', 'last_name',
                'name_suffix', 'email', 'country', 'affiliation',
            ]
        }

        for author in self.xmlroot.findall('front/author'):
            info = {
                'name': author.attrib.get('fullname'),
                'email': author.findtext('address/email'),
                'affiliation': author.findtext('organization'),
            }
            elem = author.find('address/postal/country')
            if elem is not None:
                # prefer the ascii attribute over the element text when it is non-empty
                ascii_country = elem.get('ascii', None)
                info['country'] = ascii_country if ascii_country else elem.text
            # strip whitespace from non-empty values; leave None / '' untouched
            info = {key: value.strip() if value else value for key, value in info.items()}
            result.append(empty_author | info)  # merge, preferring info
        return result

    def get_refs(self):
        """Extract references from the draft

        :returns: dict mapping document name to the reference type of the
            references section it appears in; entries whose name cannot be
            determined are omitted
        """
        refs = {}
        # accept nested <references> sections
        for section in self.xmlroot.findall("back//references"):
            ref_type = self._reference_section_type(
                self._reference_section_name(section)
            )
            for ref in (
                section.findall("./reference")
                + section.findall("./referencegroup")
                + section.findall(
                    "./xi:include", {"xi": "http://www.w3.org/2001/XInclude"}
                )
            ):
                name = self._document_name(ref)
                if name:
                    refs[name] = ref_type
        return refs
class XMLParseError(Exception):
    """An error occurred while parsing

    Carries the text captured from the parser's output and error streams.
    """

    def __init__(self, out: str, err: str, *args):
        self._out, self._err = out, err
        super().__init__(*args)

    def parser_msgs(self):
        """Return the captured output lines followed by the captured error lines"""
        messages = self._out.splitlines()
        messages.extend(self._err.splitlines())
        return messages
class InvalidXMLError(Exception):
    """Signals that the file is not valid XML"""