phml.parser

Pythonic Hypertext Markup Language (phml) parser.

  1"""Pythonic Hypertext Markup Language (phml) parser."""
  2import re
  3from copy import deepcopy
  4from operator import itemgetter
  5
  6from .nodes import (
  7    AST,
  8    Attribute,
  9    Element,
 10    Literal,
 11    LiteralType,
 12    Parent,
 13    Point,
 14    Position,
 15)
 16
 17
 18def strip(data: str, cur_tags: list[str]) -> str:
 19    """This function takes a possibly multiline string and strips leading and trailing
 20    blank lines. Given the current tag stack it will not strip the text if it is nested
 21    in a `pre` tag.
 22    """
 23    if len(cur_tags) > 0 and (
 24        cur_tags[-1] == "python"
 25        or cur_tags[-1] == "script"
 26        or cur_tags[-1] == "style"
 27        or "pre" in cur_tags
 28    ):
 29        return data
 30    return data.strip()
 31
 32
# Tag names that `HypertextMarkupParser.is_self_closing` reports as self closing
# when auto closing is enabled. The parser appends such tags as elements without
# pushing them onto the tag stack, so they never expect a closing tag.
# NOTE(review): `Slot` and `Markdown` look like phml-specific components rather
# than HTML elements — confirm against the component documentation.
self_closing = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
    "command",
    "keygen",
    "menuitem",
    "Slot",
    "Markdown",
]
 54
 55
 56# Main form of tokenization
 57class RE:
 58    tag_start = re.compile(
 59        r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)",
 60    )
 61    """Matches the start of a tag `<!name|</name|<name`"""
 62
 63    tag_end = re.compile(r"(?P<closing>/?)>")
 64    """Matches the end of a tag `/>|>`."""
 65
 66    comment = re.compile(r"<!--((?:.|\s)*)-->")
 67    """Matches all html style comments `<!--Comment-->`."""
 68    comment_close = re.compile(r"-->")
 69
 70    attribute = re.compile(
 71        r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"\s]+)))?",
 72    )
 73    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""
 74
 75    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
 76
 77
 78class HypertextMarkupParser:
 79    """Parse html/xml like source code strings."""
 80
 81    tag_stack = []
 82    """Current stack of tags in order of when they are opened."""
 83    in_pre: int = 0
 84    """Whether the current element context is inside a pre element."""
 85
 86    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
 87        """Calculate the number of lines and columns that lead to the starting point int he source
 88        string.
 89        """
 90        source = source[:start]
 91        return (
 92            source.count("\n"),
 93            len(source.split("\n")[-1]) if len(source.split("\n")) > 0 else 0,
 94        )
 95
 96    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
 97        """Calculate whether the number of columns should be added to the current column or be
 98        treated as if it is starting from zero based on whether new lines exist.
 99        """
100        return num_cols if num_lines != 0 else init_cols + num_cols
101
102    def __parse_text(self, text: str, pos: Position) -> Literal | None:
103        """Parse the comments and general text found in the provided source."""
104
105        if len(text) > 0 and strip(text, self.tag_stack) != "":
106            line, col = self.__calc_line_col(text, len(text))
107            pos.start.line += line
108            pos.start.column = col
109
110            pos.end.line += line
111            pos.end.column = self.__calc_col(line, col, pos.end.column)
112            return Literal(
113                LiteralType.Text,
114                strip(text, self.tag_stack),
115                position=Position.from_pos(pos),
116                in_pre=self.in_pre > 0,
117            )
118
119        return None
120
121    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
122        """Parse a tags attributes from the text found between the tag start and the tag end.
123
124        Example:
125            `<name (attributes)>`
126        """
127        attributes = {}
128        for attr in RE.attribute.finditer(attrs):
129            (name, value, _, double, single, no_bracket) = itemgetter(
130                "name",
131                "value",
132                "curly",
133                "double",
134                "single",
135                "open",
136            )(attr.groupdict())
137
138            value = double or single or no_bracket
139
140            if value in ["yes", "true", None]:
141                value = True
142            elif value in ["no", "false"]:
143                value = False
144
145            attributes[name] = value
146        return attributes
147
148    def __parse_tag(self, source, position: Position):
149        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
150        It will also parse any comments and text from the start of the source to the start of the
151        tag.
152        """
153        begin = RE.tag_start.search(source)
154        begin = (begin.start(), begin.group(0), begin.groupdict())
155
156        elem = None
157        if begin[0] > 0:
158            elem = self.__parse_text(source[: begin[0]], position)
159
160        position.end.column = position.start.column + len(begin[1])
161        source = source[begin[0] + len(begin[1]) :]
162
163        if begin[2]["comment"] is not None:
164            end = RE.comment_close.search(source)
165            if end is None:
166                raise Exception("Comment was not closed")
167            end = (end.start(), end.group(0), end.groupdict())
168            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
169        else:
170            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
171            end = RE.tag_end.search(source)
172            if end is None:
173                raise Exception(
174                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
175                )
176            end = (end.start(), end.group(0), end.groupdict())
177            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
178                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
179                position.end.line = position.start.line + line
180                position.end.column = position.end.column + col
181                raise Exception(
182                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
183                )
184            attributes = self.__parse_attributes(source[: end[0]])
185
186        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
187        position.end.line = position.start.line + line
188        position.end.column = position.end.column + col
189
190        return source[end[0] + len(end[1]) :], begin, attributes, end, elem
191
192    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
193        """Check if the tag is self closing. Only check if auto_closing is toggled on."""
194
195        if auto_closing:
196            return name in self_closing
197        return False  # pragma: no cover
198
199    def parse(self, source: str, auto_close: bool = True) -> AST:
200        """Parse a given html or phml string into it's corresponding phml ast.
201
202        Args:
203            source (str): The html or phml source to parse.
204
205        Returns:
206            AST: A phml AST representing the parsed code source.
207        """
208
209        self.tag_stack = []
210        current = AST()
211        position = Position((0, 0), (0, 0))
212
213        while RE.tag_start.search(source) is not None and current is not None:
214            source, begin, attr, end, elem = self.__parse_tag(source, position)
215
216            if elem is not None:
217                current.append(elem)
218
219            if begin[2]["comment"] is not None:
220                current.append(
221                    Literal(
222                        LiteralType.Comment,
223                        str(attr["data"]),
224                        position=Position.from_pos(position),
225                        in_pre=self.in_pre > 0,
226                    ),
227                )
228            else:
229                name = begin[2]["name"] or ""
230                if begin[2]["opening"] == "/":
231                    if len(self.tag_stack) == 0:
232                        raise Exception(
233                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
234                        )
235                    elif name != self.tag_stack[-1]:
236                        print("Tag Stack", self.tag_stack)
237                        raise Exception(
238                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
239                        )
240
241                    ptag = self.tag_stack.pop()
242                    if ptag == "pre":
243                        self.in_pre -= 1
244
245                    if current.position is not None:
246                        current.position.end.line = position.end.line
247                        current.position.end.column = position.end.column
248
249                    current = current.parent
250                elif begin[2]["opening"] == "!":
251                    current.append(
252                        Element(
253                            "doctype",
254                            {"lang": attr.get("lang", "html")},
255                            position=Position.from_pos(position),
256                        ),
257                    )
258                elif (
259                    end[2]["closing"] != "/"
260                    and not self.is_self_closing(name, auto_close)
261                    and begin[2]["opening"] is None
262                ):
263                    self.tag_stack.append(name)
264                    if name == "pre":
265                        self.in_pre += 1
266                    current.append(
267                        Element(
268                            name,
269                            attr,
270                            [],
271                            position=Position.from_pos(position),
272                            in_pre=self.in_pre > 0,
273                        ),
274                    )
275                    if len(current) > 0:
276                        current = current[-1]
277                else:
278                    current.append(
279                        Element(
280                            name,
281                            attr,
282                            position=deepcopy(position),
283                            in_pre=self.in_pre > 0,
284                        ),
285                    )
286
287            position.start = Point(position.end.line, position.end.column)
288
289        if len(source) > 0:
290            elem = self.__parse_text(source, position)
291            if (
292                current is not None
293                and isinstance(current, Parent)
294                and current.children is not None
295                and elem is not None
296            ):
297                current.append(elem)
298
299        if len(self.tag_stack) > 0:
300            raise Exception(
301                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
302            )
303        return current
def strip(data: str, cur_tags: list[str]) -> str:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip surrounding whitespace from text unless the tag context forbids it.

    The text is returned untouched when the innermost open tag is `python`,
    `script` or `style`, or when a `pre` tag is open anywhere in the stack.
    In every other case (including an empty stack) the result is `data.strip()`.

    Args:
        data: The possibly multiline text to clean up.
        cur_tags: The currently open tags, outermost first.

    Returns:
        Either `data` unchanged or `data` without leading/trailing whitespace.
    """
    if not cur_tags:
        return data.strip()

    # Whitespace is significant in code-like tags and everywhere below a `pre`.
    keep_verbatim = cur_tags[-1] in ("python", "script", "style") or "pre" in cur_tags
    return data if keep_verbatim else data.strip()

This function takes a possibly multiline string and strips its leading and trailing whitespace. Given the current tag stack, it will not strip the text if the innermost open tag is python, script, or style, or if the text is nested anywhere inside a pre tag.

class RE:
class RE:
    """Compiled regular expressions used to tokenize html/phml source."""

    tag_start = re.compile(
        r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)",
    )
    """Matches the start of a tag `<!--|<!name|</name|<name`, or a nameless `<`/`</` that
    is directly followed by `>`."""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    # `.*` with DOTALL matches exactly what `(?:.|\s)*` matched (any character,
    # newlines included), but without the overlapping alternatives that made a
    # failed match backtrack exponentially on long runs of whitespace.
    comment = re.compile(r"<!--(.*)-->", re.DOTALL)
    """Matches all html style comments `<!--Comment-->`."""
    comment_close = re.compile(r"-->")
    """Matches the end of a comment `-->`."""

    attribute = re.compile(
        r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"\s]+)))?",
    )
    """Matches a tag's attributes `attr|attr=value|attr='value'|attr="value"|attr={value/}`."""

    # Same DOTALL rewrite as `comment`; the capture group is unchanged.
    bracket_attributte = re.compile(r"^\s*\{(.*)\/\}\s*$", re.DOTALL)
    """Matches a whole string of the form `{value/}` (optionally surrounded by whitespace)
    and captures everything between `{` and `/}`, newlines included."""
RE()
tag_start = re.compile('(?P<comment><!--)|<(?!!--)(?P<opening>!|\\/)?(?P<name>([\\w:\\.]+\\-?)+)|<(?P<opening2>/)?(?=\\s+>|>)')

Matches the start of a tag <!name|</name|<name

tag_end = re.compile('(?P<closing>/?)>')

Matches the end of a tag />|>.

comment = re.compile('<!--((?:.|\\s)*)-->')

Matches all html style comments <!--Comment-->.

attribute = re.compile('(?P<name>[\\w:\\-@]+)(?:=(?P<value>\\{(?P<curly>[^\\}]*)\\/\\}|\\"(?P<double>[^\\"]*)\\"|\'(?P<single>[^\']*)\'|(?P<open>[^>\'\\"\\s]+)))?')

Matches a tag's attributes attr|attr=value|attr='value'|attr="value"|attr={value/}.

class HypertextMarkupParser:
 79class HypertextMarkupParser:
 80    """Parse html/xml like source code strings."""
 81
 82    tag_stack = []
 83    """Current stack of tags in order of when they are opened."""
 84    in_pre: int = 0
 85    """Whether the current element context is inside a pre element."""
 86
 87    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
 88        """Calculate the number of lines and columns that lead to the starting point int he source
 89        string.
 90        """
 91        source = source[:start]
 92        return (
 93            source.count("\n"),
 94            len(source.split("\n")[-1]) if len(source.split("\n")) > 0 else 0,
 95        )
 96
 97    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
 98        """Calculate whether the number of columns should be added to the current column or be
 99        treated as if it is starting from zero based on whether new lines exist.
100        """
101        return num_cols if num_lines != 0 else init_cols + num_cols
102
103    def __parse_text(self, text: str, pos: Position) -> Literal | None:
104        """Parse the comments and general text found in the provided source."""
105
106        if len(text) > 0 and strip(text, self.tag_stack) != "":
107            line, col = self.__calc_line_col(text, len(text))
108            pos.start.line += line
109            pos.start.column = col
110
111            pos.end.line += line
112            pos.end.column = self.__calc_col(line, col, pos.end.column)
113            return Literal(
114                LiteralType.Text,
115                strip(text, self.tag_stack),
116                position=Position.from_pos(pos),
117                in_pre=self.in_pre > 0,
118            )
119
120        return None
121
122    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
123        """Parse a tags attributes from the text found between the tag start and the tag end.
124
125        Example:
126            `<name (attributes)>`
127        """
128        attributes = {}
129        for attr in RE.attribute.finditer(attrs):
130            (name, value, _, double, single, no_bracket) = itemgetter(
131                "name",
132                "value",
133                "curly",
134                "double",
135                "single",
136                "open",
137            )(attr.groupdict())
138
139            value = double or single or no_bracket
140
141            if value in ["yes", "true", None]:
142                value = True
143            elif value in ["no", "false"]:
144                value = False
145
146            attributes[name] = value
147        return attributes
148
149    def __parse_tag(self, source, position: Position):
150        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
151        It will also parse any comments and text from the start of the source to the start of the
152        tag.
153        """
154        begin = RE.tag_start.search(source)
155        begin = (begin.start(), begin.group(0), begin.groupdict())
156
157        elem = None
158        if begin[0] > 0:
159            elem = self.__parse_text(source[: begin[0]], position)
160
161        position.end.column = position.start.column + len(begin[1])
162        source = source[begin[0] + len(begin[1]) :]
163
164        if begin[2]["comment"] is not None:
165            end = RE.comment_close.search(source)
166            if end is None:
167                raise Exception("Comment was not closed")
168            end = (end.start(), end.group(0), end.groupdict())
169            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
170        else:
171            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
172            end = RE.tag_end.search(source)
173            if end is None:
174                raise Exception(
175                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
176                )
177            end = (end.start(), end.group(0), end.groupdict())
178            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
179                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
180                position.end.line = position.start.line + line
181                position.end.column = position.end.column + col
182                raise Exception(
183                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
184                )
185            attributes = self.__parse_attributes(source[: end[0]])
186
187        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
188        position.end.line = position.start.line + line
189        position.end.column = position.end.column + col
190
191        return source[end[0] + len(end[1]) :], begin, attributes, end, elem
192
193    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
194        """Check if the tag is self closing. Only check if auto_closing is toggled on."""
195
196        if auto_closing:
197            return name in self_closing
198        return False  # pragma: no cover
199
200    def parse(self, source: str, auto_close: bool = True) -> AST:
201        """Parse a given html or phml string into it's corresponding phml ast.
202
203        Args:
204            source (str): The html or phml source to parse.
205
206        Returns:
207            AST: A phml AST representing the parsed code source.
208        """
209
210        self.tag_stack = []
211        current = AST()
212        position = Position((0, 0), (0, 0))
213
214        while RE.tag_start.search(source) is not None and current is not None:
215            source, begin, attr, end, elem = self.__parse_tag(source, position)
216
217            if elem is not None:
218                current.append(elem)
219
220            if begin[2]["comment"] is not None:
221                current.append(
222                    Literal(
223                        LiteralType.Comment,
224                        str(attr["data"]),
225                        position=Position.from_pos(position),
226                        in_pre=self.in_pre > 0,
227                    ),
228                )
229            else:
230                name = begin[2]["name"] or ""
231                if begin[2]["opening"] == "/":
232                    if len(self.tag_stack) == 0:
233                        raise Exception(
234                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
235                        )
236                    elif name != self.tag_stack[-1]:
237                        print("Tag Stack", self.tag_stack)
238                        raise Exception(
239                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
240                        )
241
242                    ptag = self.tag_stack.pop()
243                    if ptag == "pre":
244                        self.in_pre -= 1
245
246                    if current.position is not None:
247                        current.position.end.line = position.end.line
248                        current.position.end.column = position.end.column
249
250                    current = current.parent
251                elif begin[2]["opening"] == "!":
252                    current.append(
253                        Element(
254                            "doctype",
255                            {"lang": attr.get("lang", "html")},
256                            position=Position.from_pos(position),
257                        ),
258                    )
259                elif (
260                    end[2]["closing"] != "/"
261                    and not self.is_self_closing(name, auto_close)
262                    and begin[2]["opening"] is None
263                ):
264                    self.tag_stack.append(name)
265                    if name == "pre":
266                        self.in_pre += 1
267                    current.append(
268                        Element(
269                            name,
270                            attr,
271                            [],
272                            position=Position.from_pos(position),
273                            in_pre=self.in_pre > 0,
274                        ),
275                    )
276                    if len(current) > 0:
277                        current = current[-1]
278                else:
279                    current.append(
280                        Element(
281                            name,
282                            attr,
283                            position=deepcopy(position),
284                            in_pre=self.in_pre > 0,
285                        ),
286                    )
287
288            position.start = Point(position.end.line, position.end.column)
289
290        if len(source) > 0:
291            elem = self.__parse_text(source, position)
292            if (
293                current is not None
294                and isinstance(current, Parent)
295                and current.children is not None
296                and elem is not None
297            ):
298                current.append(elem)
299
300        if len(self.tag_stack) > 0:
301            raise Exception(
302                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
303            )
304        return current

Parse html/xml like source code strings.

HypertextMarkupParser()
tag_stack = []

Current stack of tags in order of when they are opened.

in_pre: int = 0

Number of currently open pre elements; the current element context is inside a pre element whenever this count is greater than zero.

def is_self_closing(self, name: str, auto_closing: bool) -> bool:
193    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
194        """Check if the tag is self closing. Only check if auto_closing is toggled on."""
195
196        if auto_closing:
197            return name in self_closing
198        return False  # pragma: no cover

Check if the tag is self closing. Only check if auto_closing is toggled on.

def parse(self, source: str, auto_close: bool = True) -> phml.nodes.AST:
200    def parse(self, source: str, auto_close: bool = True) -> AST:
201        """Parse a given html or phml string into it's corresponding phml ast.
202
203        Args:
204            source (str): The html or phml source to parse.
205
206        Returns:
207            AST: A phml AST representing the parsed code source.
208        """
209
210        self.tag_stack = []
211        current = AST()
212        position = Position((0, 0), (0, 0))
213
214        while RE.tag_start.search(source) is not None and current is not None:
215            source, begin, attr, end, elem = self.__parse_tag(source, position)
216
217            if elem is not None:
218                current.append(elem)
219
220            if begin[2]["comment"] is not None:
221                current.append(
222                    Literal(
223                        LiteralType.Comment,
224                        str(attr["data"]),
225                        position=Position.from_pos(position),
226                        in_pre=self.in_pre > 0,
227                    ),
228                )
229            else:
230                name = begin[2]["name"] or ""
231                if begin[2]["opening"] == "/":
232                    if len(self.tag_stack) == 0:
233                        raise Exception(
234                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
235                        )
236                    elif name != self.tag_stack[-1]:
237                        print("Tag Stack", self.tag_stack)
238                        raise Exception(
239                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
240                        )
241
242                    ptag = self.tag_stack.pop()
243                    if ptag == "pre":
244                        self.in_pre -= 1
245
246                    if current.position is not None:
247                        current.position.end.line = position.end.line
248                        current.position.end.column = position.end.column
249
250                    current = current.parent
251                elif begin[2]["opening"] == "!":
252                    current.append(
253                        Element(
254                            "doctype",
255                            {"lang": attr.get("lang", "html")},
256                            position=Position.from_pos(position),
257                        ),
258                    )
259                elif (
260                    end[2]["closing"] != "/"
261                    and not self.is_self_closing(name, auto_close)
262                    and begin[2]["opening"] is None
263                ):
264                    self.tag_stack.append(name)
265                    if name == "pre":
266                        self.in_pre += 1
267                    current.append(
268                        Element(
269                            name,
270                            attr,
271                            [],
272                            position=Position.from_pos(position),
273                            in_pre=self.in_pre > 0,
274                        ),
275                    )
276                    if len(current) > 0:
277                        current = current[-1]
278                else:
279                    current.append(
280                        Element(
281                            name,
282                            attr,
283                            position=deepcopy(position),
284                            in_pre=self.in_pre > 0,
285                        ),
286                    )
287
288            position.start = Point(position.end.line, position.end.column)
289
290        if len(source) > 0:
291            elem = self.__parse_text(source, position)
292            if (
293                current is not None
294                and isinstance(current, Parent)
295                and current.children is not None
296                and elem is not None
297            ):
298                current.append(elem)
299
300        if len(self.tag_stack) > 0:
301            raise Exception(
302                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
303            )
304        return current

Parse a given html or phml string into its corresponding phml ast.

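Args
  • source (str): The html or phml source to parse.
  • auto_close (bool): When True (the default), tags whose names are listed in self_closing are treated as self closing even without a trailing "/".
Returns

AST: A phml AST representing the parsed code source.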