Simplify XML parsing by iterating only once

This simplifies the XML parsing by iterating over the DOM tree only once. Curiously, this doesn't result in significant performance gains. As the keywords are now found in the order they appear in the document instead of the order they are mentioned in messages.json, the order of a few strings in the PO templates changes as a result of this commit.
2024-09-07 06:38:54 +02:00 · 2024-09-07 06:38:54 +02:00 · 0e84957979
commit 0e84957979
parent eeb502c115
1 changed files with 36 additions and 34 deletions
--- a/source/tools/i18n/i18n_helper/extractors.py
+++ b/source/tools/i18n/i18n_helper/extractors.py
@@ -452,38 +452,40 @@ class XmlExtractor(Extractor):
    def extract_from_file(self, filepath):
        with open(filepath, encoding="utf-8-sig") as file_object:
            xml_document = etree.parse(file_object)
            for keyword in self.keywords:
                for element in xml_document.iter(keyword):
                    lineno = element.sourceline
                    if element.text is None:
                        continue
-                    comments = []
+        for element in xml_document.iter(*self.keywords.keys()):
-                    if "extractJson" in self.keywords[keyword]:
+            keyword = element.tag
-                        json_extractor = self.get_json_extractor()
+
-                        json_extractor.set_options(self.keywords[keyword]["extractJson"])
+            lineno = element.sourceline
-                        for message, context in json_extractor.extract_from_string(element.text):
+            if element.text is None:
-                            yield message, None, context, lineno, comments
+                continue
-                    else:
+
-                        context = None
+            comments = []
-                        if "context" in element.attrib:
+            if "extractJson" in self.keywords[keyword]:
-                            context = str(element.get("context"))
+                json_extractor = self.get_json_extractor()
-                        elif "tagAsContext" in self.keywords[keyword]:
+                json_extractor.set_options(self.keywords[keyword]["extractJson"])
-                            context = keyword
+                for message, context in json_extractor.extract_from_string(element.text):
-                        elif "customContext" in self.keywords[keyword]:
+                    yield message, None, context, lineno, comments
-                            context = self.keywords[keyword]["customContext"]
+            else:
-                        if "comment" in element.attrib:
+                context = None
-                            comment = element.get("comment")
+                if "context" in element.attrib:
-                            comment = " ".join(
+                    context = str(element.get("context"))
-                                comment.split()
+                elif "tagAsContext" in self.keywords[keyword]:
-                            )  # Remove tabs, line breaks and unecessary spaces.
+                    context = keyword
-                            comments.append(comment)
+                elif "customContext" in self.keywords[keyword]:
-                        if "splitOnWhitespace" in self.keywords[keyword]:
+                    context = self.keywords[keyword]["customContext"]
-                            for split_text in element.text.split():
+                if "comment" in element.attrib:
-                                # split on whitespace is used for token lists, there, a
+                    comment = element.get("comment")
-                                # leading '-' means the token has to be removed, so it's not
+                    comment = " ".join(
-                                # to be processed here either
+                        comment.split()
-                                if split_text[0] != "-":
+                    )  # Remove tabs, line breaks and unnecessary spaces.
-                                    yield str(split_text), None, context, lineno, comments
+                    comments.append(comment)
-                        else:
+                if "splitOnWhitespace" in self.keywords[keyword]:
-                            yield str(element.text), None, context, lineno, comments
+                    for split_text in element.text.split():
                        # split on whitespace is used for token lists, there, a
                        # leading '-' means the token has to be removed, so it's not
                        # to be processed here either
                        if split_text[0] != "-":
                            yield str(split_text), None, context, lineno, comments
                else:
                    yield str(element.text), None, context, lineno, comments