1
0
forked from 0ad/0ad

Simplify XML parsing by iterating only once

This simplifies the XML parsing by iterating over the DOM tree only
once. Curiously, this doesn't result in significant performance gains.

Because the keywords are now found in the order in which they appear in
the document, rather than the order in which they are listed in
messages.json, this commit also changes the order of a few strings in
the PO templates.
This commit is contained in:
Dunedan 2024-09-07 06:38:54 +02:00
parent eeb502c115
commit 0e84957979
Signed by untrusted user: Dunedan
GPG Key ID: 885B16854284E0B2

View File

@ -452,38 +452,40 @@ class XmlExtractor(Extractor):
def extract_from_file(self, filepath): def extract_from_file(self, filepath):
with open(filepath, encoding="utf-8-sig") as file_object: with open(filepath, encoding="utf-8-sig") as file_object:
xml_document = etree.parse(file_object) xml_document = etree.parse(file_object)
for keyword in self.keywords:
for element in xml_document.iter(keyword):
lineno = element.sourceline
if element.text is None:
continue
comments = [] for element in xml_document.iter(*self.keywords.keys()):
if "extractJson" in self.keywords[keyword]: keyword = element.tag
json_extractor = self.get_json_extractor()
json_extractor.set_options(self.keywords[keyword]["extractJson"]) lineno = element.sourceline
for message, context in json_extractor.extract_from_string(element.text): if element.text is None:
yield message, None, context, lineno, comments continue
else:
context = None comments = []
if "context" in element.attrib: if "extractJson" in self.keywords[keyword]:
context = str(element.get("context")) json_extractor = self.get_json_extractor()
elif "tagAsContext" in self.keywords[keyword]: json_extractor.set_options(self.keywords[keyword]["extractJson"])
context = keyword for message, context in json_extractor.extract_from_string(element.text):
elif "customContext" in self.keywords[keyword]: yield message, None, context, lineno, comments
context = self.keywords[keyword]["customContext"] else:
if "comment" in element.attrib: context = None
comment = element.get("comment") if "context" in element.attrib:
comment = " ".join( context = str(element.get("context"))
comment.split() elif "tagAsContext" in self.keywords[keyword]:
) # Remove tabs, line breaks and unecessary spaces. context = keyword
comments.append(comment) elif "customContext" in self.keywords[keyword]:
if "splitOnWhitespace" in self.keywords[keyword]: context = self.keywords[keyword]["customContext"]
for split_text in element.text.split(): if "comment" in element.attrib:
# split on whitespace is used for token lists, there, a comment = element.get("comment")
# leading '-' means the token has to be removed, so it's not comment = " ".join(
# to be processed here either comment.split()
if split_text[0] != "-": ) # Remove tabs, line breaks and unnecessary spaces.
yield str(split_text), None, context, lineno, comments comments.append(comment)
else: if "splitOnWhitespace" in self.keywords[keyword]:
yield str(element.text), None, context, lineno, comments for split_text in element.text.split():
# split on whitespace is used for token lists, there, a
# leading '-' means the token has to be removed, so it's not
# to be processed here either
if split_text[0] != "-":
yield str(split_text), None, context, lineno, comments
else:
yield str(element.text), None, context, lineno, comments