phml.parser
Pythonic Hypertext Markup Language (phml) parser.
"""Pythonic Hypertext Markup Language (phml) parser."""
import re
from copy import deepcopy
from operator import itemgetter

from .nodes import (
    AST,
    Attribute,
    Element,
    Literal,
    LiteralType,
    Parent,
    Point,
    Position,
)


def strip(data: str, cur_tags: list[str]) -> str:
    """Strip leading and trailing whitespace from `data` unless it must be kept verbatim.

    The text is returned untouched when the innermost open tag is `python`,
    `script` or `style`, or when a `pre` tag is open anywhere in the stack.

    Args:
        data (str): The, possibly multiline, text to clean up.
        cur_tags (list[str]): Currently open tags, outermost first.

    Returns:
        str: `data` unchanged, or `data.strip()`.
    """
    if cur_tags and (cur_tags[-1] in ("python", "script", "style") or "pre" in cur_tags):
        return data
    return data.strip()


self_closing = [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
    "command",
    "keygen",
    "menuitem",
    "Slot",
    "Markdown",
]


# Main form of tokenization
class RE:
    """Compiled regular expressions used to tokenize the source."""

    tag_start = re.compile(
        r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)",
    )
    """Matches the start of a tag `<!name|</name|<name`"""

    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    comment = re.compile(r"<!--((?:.|\s)*)-->")
    """Matches all html style comments `<!--Comment-->`."""
    comment_close = re.compile(r"-->")

    attribute = re.compile(
        r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"\s]+)))?",
    )
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""

    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")


class HypertextMarkupParser:
    """Parse html/xml like source code strings.

    Per-run state (`tag_stack`, `in_pre`) lives on the instance and is reset at
    the start of every `parse` call, so one instance can parse several sources.
    """

    # Rebound to a fresh list by `parse`; the class-level list is never mutated.
    tag_stack = []
    """Current stack of tags in order of when they are opened."""
    in_pre: int = 0
    """Number of currently open `pre` elements; text is inside `pre` when > 0."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: Number of newlines in `source[:start]` and the number of
            characters after the last of those newlines.
        """
        consumed = source[:start]
        # `rfind` yields -1 without a newline, which makes the column the full length.
        return consumed.count("\n"), len(consumed) - (consumed.rfind("\n") + 1)

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text(self, text: str, pos: Position) -> Literal | None:
        """Turn the text found between two tags into a text literal.

        `pos` is advanced in place by the lines/columns the raw text spans.

        Returns:
            Literal | None: The text node, or None when nothing is left after stripping.
        """
        content = strip(text, self.tag_stack)
        if content == "":
            return None

        line, col = self.__calc_line_col(text, len(text))
        pos.start.line += line
        pos.start.column = col

        pos.end.line += line
        pos.end.column = self.__calc_col(line, col, pos.end.column)
        return Literal(
            LiteralType.Text,
            content,
            position=Position.from_pos(pos),
            in_pre=self.in_pre > 0,
        )

    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Values `yes`/`true` become True and `no`/`false` become False. An attribute
        without a value, with an empty quoted value, or with a `{.../}` value is
        stored as True because only the quoted and bare value groups are read.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name, double, single, bare = itemgetter("name", "double", "single", "open")(
                attr.groupdict()
            )

            value = double or single or bare

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure `RE.tag_start` matches somewhere in `source`.

        Returns:
            A tuple `(rest, begin, attributes, end, elem)` where `rest` is the source after
            the tag, `begin`/`end` are `(offset, matched text, groupdict)` tuples, and `elem`
            is the text literal that preceded the tag (or None).

        Raises:
            Exception: When a comment or a tag is never closed.
        """
        found = RE.tag_start.search(source)
        begin = (found.start(), found.group(0), found.groupdict())

        elem = None
        if begin[0] > 0:
            elem = self.__parse_text(source[: begin[0]], position)

        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]) :]

        if begin[2]["comment"] is not None:
            close = RE.comment_close.search(source)
            if close is None:
                raise Exception("Comment was not closed")
            end = (close.start(), close.group(0), close.groupdict())
            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
        else:
            # `</>` style tags capture the slash in `opening2`; fold it into `opening`.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
            close = RE.tag_end.search(source)
            if close is None:
                raise Exception(
                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
                )
            end = (close.start(), close.group(0), close.groupdict())
            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
                # The first `>` found belongs to a later tag.
                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
                position.end.line = position.start.line + line
                position.end.column = self.__calc_col(line, col, position.end.column)
                raise Exception(
                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
                )
            attributes = self.__parse_attributes(source[: end[0]])

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        # A tag or comment spanning several lines restarts the column count.
        position.end.column = self.__calc_col(line, col, position.end.column)

        return source[end[0] + len(end[1]) :], begin, attributes, end, elem

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if not auto_closing:
            return False  # pragma: no cover
        return name in self_closing

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): Treat the tags listed in `self_closing` as closed
                even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: When tags are unbalanced or left unclosed.
        """

        self.tag_stack = []
        # A previous run that failed inside `<pre>` must not leak into this one.
        self.in_pre = 0
        current = AST()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None and current is not None:
            source, begin, attr, end, elem = self.__parse_tag(source, position)
            groups = begin[2]

            if elem is not None:
                current.append(elem)

            if groups["comment"] is not None:
                current.append(
                    Literal(
                        LiteralType.Comment,
                        str(attr["data"]),
                        position=Position.from_pos(position),
                        in_pre=self.in_pre > 0,
                    ),
                )
            else:
                name = groups["name"] or ""
                if groups["opening"] == "/":
                    if not self.tag_stack:
                        raise Exception(
                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
                        )

                    if self.tag_stack.pop() == "pre":
                        self.in_pre -= 1

                    if current.position is not None:
                        current.position.end.line = position.end.line
                        current.position.end.column = position.end.column

                    current = current.parent
                elif groups["opening"] == "!":
                    current.append(
                        Element(
                            "doctype",
                            {"lang": attr.get("lang", "html")},
                            position=Position.from_pos(position),
                        ),
                    )
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and groups["opening"] is None
                ):
                    # Opening tag with children: descend into the new element.
                    self.tag_stack.append(name)
                    if name == "pre":
                        self.in_pre += 1
                    current.append(
                        Element(
                            name,
                            attr,
                            [],
                            position=Position.from_pos(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )
                    if len(current) > 0:
                        current = current[-1]
                else:
                    # Self closing tag: added as a leaf, the stack is untouched.
                    current.append(
                        Element(
                            name,
                            attr,
                            position=deepcopy(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )

            position.start = Point(position.end.line, position.end.column)

        if len(source) > 0:
            # Text left over after the last tag.
            elem = self.__parse_text(source, position)
            if (
                current is not None
                and isinstance(current, Parent)
                and current.children is not None
                and elem is not None
            ):
                current.append(elem)

        if len(self.tag_stack) > 0:
            raise Exception(
                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
            )
        return current
def
strip(data: str, cur_tags: list[str]) -> str:
def strip(data: str, cur_tags: list[str]) -> str:
    """Strip leading and trailing whitespace from `data` unless it must be kept verbatim.

    The text is returned untouched when the innermost open tag is `python`,
    `script` or `style`, or when a `pre` tag is open anywhere in the stack.

    Args:
        data (str): The, possibly multiline, text to clean up.
        cur_tags (list[str]): Currently open tags, outermost first.

    Returns:
        str: `data` unchanged, or `data.strip()`.
    """
    if not cur_tags:
        return data.strip()

    innermost = cur_tags[-1]
    keep_verbatim = innermost in ("python", "script", "style") or "pre" in cur_tags
    return data if keep_verbatim else data.strip()
This function takes a possibly multiline string and strips leading and trailing
whitespace. Given the current tag stack, it will not strip the text if the innermost
tag is python, script or style, or if the text is nested anywhere inside a pre
tag.
class
RE:
class RE:
    """Namespace of compiled regular expressions for html/phml tokenization."""

    # Three alternatives: a comment opener `<!--`; a named tag start with an
    # optional `!` or `/` prefix (captured as `opening`); or a nameless `<` / `</`
    # that is directly followed by optional whitespace and `>` (`opening2`).
    tag_start = re.compile(
        r"(?P<comment><!--)|<(?!!--)(?P<opening>!|\/)?(?P<name>([\w:\.]+\-?)+)|<(?P<opening2>/)?(?=\s+>|>)",
    )
    """Matches the start of a tag `<!name|</name|<name`"""

    # `closing` is "/" for `/>` and the empty string for a plain `>`.
    tag_end = re.compile(r"(?P<closing>/?)>")
    """Matches the end of a tag `/>|>`."""

    # Greedy: the captured group runs up to the LAST `-->` in the searched text.
    comment = re.compile(r"<!--((?:.|\s)*)-->")
    """Matches all html style comments `<!--Comment-->`."""
    # Matches only the comment terminator.
    comment_close = re.compile(r"-->")

    # The value is optional. When present it is exactly one of: `{.../}`
    # (`curly`), double quoted (`double`), single quoted (`single`) or a bare
    # token without quotes, whitespace or `>` (`open`).
    attribute = re.compile(
        r"(?P<name>[\w:\-@]+)(?:=(?P<value>\{(?P<curly>[^\}]*)\/\}|\"(?P<double>[^\"]*)\"|'(?P<single>[^']*)'|(?P<open>[^>'\"\s]+)))?",
    )
    """Matches a tags attributes `attr|attr=value|attr='value'|attr="value"`."""

    # Matches a whole string of the form `{ ... /}` (surrounding whitespace
    # allowed) and captures what is between the braces. The attribute name is
    # misspelled (sic); it is kept as is because other code may reference it.
    bracket_attributte = re.compile(r"^\s*\{((?:\s|.)*)\/\}\s*$")
class
HypertextMarkupParser:
class HypertextMarkupParser:
    """Parse html/xml like source code strings.

    Per-run state (`tag_stack`, `in_pre`) lives on the instance and is reset at
    the start of every `parse` call, so one instance can parse several sources.
    """

    # Rebound to a fresh list by `parse`; the class-level list is never mutated.
    tag_stack = []
    """Current stack of tags in order of when they are opened."""
    in_pre: int = 0
    """Number of currently open `pre` elements; text is inside `pre` when > 0."""

    def __calc_line_col(self, source: str, start: int) -> tuple[int, int]:
        """Calculate the number of lines and columns that lead to the starting point in the
        source string.

        Returns:
            tuple[int, int]: Number of newlines in `source[:start]` and the number of
            characters after the last of those newlines.
        """
        consumed = source[:start]
        # `rfind` yields -1 without a newline, which makes the column the full length.
        return consumed.count("\n"), len(consumed) - (consumed.rfind("\n") + 1)

    def __calc_col(self, num_lines: int, num_cols: int, init_cols: int) -> int:
        """Calculate whether the number of columns should be added to the current column or be
        treated as if it is starting from zero based on whether new lines exist.
        """
        return num_cols if num_lines != 0 else init_cols + num_cols

    def __parse_text(self, text: str, pos: Position) -> Literal | None:
        """Turn the text found between two tags into a text literal.

        `pos` is advanced in place by the lines/columns the raw text spans.

        Returns:
            Literal | None: The text node, or None when nothing is left after stripping.
        """
        content = strip(text, self.tag_stack)
        if content == "":
            return None

        line, col = self.__calc_line_col(text, len(text))
        pos.start.line += line
        pos.start.column = col

        pos.end.line += line
        pos.end.column = self.__calc_col(line, col, pos.end.column)
        return Literal(
            LiteralType.Text,
            content,
            position=Position.from_pos(pos),
            in_pre=self.in_pre > 0,
        )

    def __parse_attributes(self, attrs: str) -> dict[str, Attribute]:
        """Parse a tags attributes from the text found between the tag start and the tag end.

        Example:
            `<name (attributes)>`

        Values `yes`/`true` become True and `no`/`false` become False. An attribute
        without a value, with an empty quoted value, or with a `{.../}` value is
        stored as True because only the quoted and bare value groups are read.
        """
        attributes = {}
        for attr in RE.attribute.finditer(attrs):
            name, double, single, bare = itemgetter("name", "double", "single", "open")(
                attr.groupdict()
            )

            value = double or single or bare

            if value in ["yes", "true", None]:
                value = True
            elif value in ["no", "false"]:
                value = False

            attributes[name] = value
        return attributes

    def __parse_tag(self, source, position: Position):
        """Parse a tag from the given source. This includes the tag start, attributes and tag end.
        It will also parse any comments and text from the start of the source to the start of the
        tag.

        The caller must make sure `RE.tag_start` matches somewhere in `source`.

        Returns:
            A tuple `(rest, begin, attributes, end, elem)` where `rest` is the source after
            the tag, `begin`/`end` are `(offset, matched text, groupdict)` tuples, and `elem`
            is the text literal that preceded the tag (or None).

        Raises:
            Exception: When a comment or a tag is never closed.
        """
        found = RE.tag_start.search(source)
        begin = (found.start(), found.group(0), found.groupdict())

        elem = None
        if begin[0] > 0:
            elem = self.__parse_text(source[: begin[0]], position)

        position.end.column = position.start.column + len(begin[1])
        source = source[begin[0] + len(begin[1]) :]

        if begin[2]["comment"] is not None:
            close = RE.comment_close.search(source)
            if close is None:
                raise Exception("Comment was not closed")
            end = (close.start(), close.group(0), close.groupdict())
            attributes: dict[str, Attribute] = {"data": source[: end[0]]}
        else:
            # `</>` style tags capture the slash in `opening2`; fold it into `opening`.
            begin[2]["opening"] = begin[2]["opening"] or begin[2]["opening2"]
            close = RE.tag_end.search(source)
            if close is None:
                raise Exception(
                    f"Expected tag {begin[1]} to be closed with symbol '>'. Was not closed.",
                )
            end = (close.start(), close.group(0), close.groupdict())
            if begin[2]["opening"] == "/" and "<" in source[: end[0]]:
                # The first `>` found belongs to a later tag.
                line, col = self.__calc_line_col(source, end[0] + len(end[1]))
                position.end.line = position.start.line + line
                position.end.column = self.__calc_col(line, col, position.end.column)
                raise Exception(
                    f"Closing tag {begin[1]!r} was not closed, maybe it is missing a '>' symbol"
                )
            attributes = self.__parse_attributes(source[: end[0]])

        line, col = self.__calc_line_col(source, end[0] + len(end[1]))
        position.end.line = position.start.line + line
        # A tag or comment spanning several lines restarts the column count.
        position.end.column = self.__calc_col(line, col, position.end.column)

        return source[end[0] + len(end[1]) :], begin, attributes, end, elem

    def is_self_closing(self, name: str, auto_closing: bool) -> bool:
        """Check if the tag is self closing. Only check if auto_closing is toggled on."""

        if not auto_closing:
            return False  # pragma: no cover
        return name in self_closing

    def parse(self, source: str, auto_close: bool = True) -> AST:
        """Parse a given html or phml string into its corresponding phml ast.

        Args:
            source (str): The html or phml source to parse.
            auto_close (bool): Treat the tags listed in `self_closing` as closed
                even without a trailing `/`.

        Returns:
            AST: A phml AST representing the parsed code source.

        Raises:
            Exception: When tags are unbalanced or left unclosed.
        """

        self.tag_stack = []
        # A previous run that failed inside `<pre>` must not leak into this one.
        self.in_pre = 0
        current = AST()
        position = Position((0, 0), (0, 0))

        while RE.tag_start.search(source) is not None and current is not None:
            source, begin, attr, end, elem = self.__parse_tag(source, position)
            groups = begin[2]

            if elem is not None:
                current.append(elem)

            if groups["comment"] is not None:
                current.append(
                    Literal(
                        LiteralType.Comment,
                        str(attr["data"]),
                        position=Position.from_pos(position),
                        in_pre=self.in_pre > 0,
                    ),
                )
            else:
                name = groups["name"] or ""
                if groups["opening"] == "/":
                    if not self.tag_stack:
                        raise Exception(
                            f"Unbalanced tags: Tag was closed without first being opened at {position}",
                        )
                    if name != self.tag_stack[-1]:
                        raise Exception(
                            f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
                        )

                    if self.tag_stack.pop() == "pre":
                        self.in_pre -= 1

                    if current.position is not None:
                        current.position.end.line = position.end.line
                        current.position.end.column = position.end.column

                    current = current.parent
                elif groups["opening"] == "!":
                    current.append(
                        Element(
                            "doctype",
                            {"lang": attr.get("lang", "html")},
                            position=Position.from_pos(position),
                        ),
                    )
                elif (
                    end[2]["closing"] != "/"
                    and not self.is_self_closing(name, auto_close)
                    and groups["opening"] is None
                ):
                    # Opening tag with children: descend into the new element.
                    self.tag_stack.append(name)
                    if name == "pre":
                        self.in_pre += 1
                    current.append(
                        Element(
                            name,
                            attr,
                            [],
                            position=Position.from_pos(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )
                    if len(current) > 0:
                        current = current[-1]
                else:
                    # Self closing tag: added as a leaf, the stack is untouched.
                    current.append(
                        Element(
                            name,
                            attr,
                            position=deepcopy(position),
                            in_pre=self.in_pre > 0,
                        ),
                    )

            position.start = Point(position.end.line, position.end.column)

        if len(source) > 0:
            # Text left over after the last tag.
            elem = self.__parse_text(source, position)
            if (
                current is not None
                and isinstance(current, Parent)
                and current.children is not None
                and elem is not None
            ):
                current.append(elem)

        if len(self.tag_stack) > 0:
            raise Exception(
                f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
            )
        return current
Parse html/xml like source code strings.
def
is_self_closing(self, name: str, auto_closing: bool) -> bool:
def is_self_closing(self, name: str, auto_closing: bool) -> bool:
    """Tell whether `name` is one of the tags listed in `self_closing`.

    The lookup is only performed when `auto_closing` is enabled; otherwise no
    tag is reported as self closing.
    """
    if not auto_closing:
        return False  # pragma: no cover
    return name in self_closing
Check if the tag is self closing. Only check if auto_closing is toggled on.
def parse(self, source: str, auto_close: bool = True) -> AST:
    """Parse a given html or phml string into its corresponding phml ast.

    Args:
        source (str): The html or phml source to parse.
        auto_close (bool): Treat the tags listed in `self_closing` as closed
            even without a trailing `/`.

    Returns:
        AST: A phml AST representing the parsed code source.

    Raises:
        Exception: When tags are unbalanced or left unclosed.
    """

    self.tag_stack = []
    # A previous run that failed inside `<pre>` must not leak into this one.
    self.in_pre = 0
    current = AST()
    position = Position((0, 0), (0, 0))

    while RE.tag_start.search(source) is not None and current is not None:
        source, begin, attr, end, elem = self.__parse_tag(source, position)
        groups = begin[2]

        if elem is not None:
            current.append(elem)

        if groups["comment"] is not None:
            current.append(
                Literal(
                    LiteralType.Comment,
                    str(attr["data"]),
                    position=Position.from_pos(position),
                    in_pre=self.in_pre > 0,
                ),
            )
        else:
            name = groups["name"] or ""
            if groups["opening"] == "/":
                if not self.tag_stack:
                    raise Exception(
                        f"Unbalanced tags: Tag was closed without first being opened at {position}",
                    )
                if name != self.tag_stack[-1]:
                    raise Exception(
                        f"Unbalanced tags: {name!r} | {self.tag_stack[-1]!r} at {position}",
                    )

                if self.tag_stack.pop() == "pre":
                    self.in_pre -= 1

                if current.position is not None:
                    current.position.end.line = position.end.line
                    current.position.end.column = position.end.column

                current = current.parent
            elif groups["opening"] == "!":
                current.append(
                    Element(
                        "doctype",
                        {"lang": attr.get("lang", "html")},
                        position=Position.from_pos(position),
                    ),
                )
            elif (
                end[2]["closing"] != "/"
                and not self.is_self_closing(name, auto_close)
                and groups["opening"] is None
            ):
                # Opening tag with children: descend into the new element.
                self.tag_stack.append(name)
                if name == "pre":
                    self.in_pre += 1
                current.append(
                    Element(
                        name,
                        attr,
                        [],
                        position=Position.from_pos(position),
                        in_pre=self.in_pre > 0,
                    ),
                )
                if len(current) > 0:
                    current = current[-1]
            else:
                # Self closing tag: added as a leaf, the stack is untouched.
                current.append(
                    Element(
                        name,
                        attr,
                        position=deepcopy(position),
                        in_pre=self.in_pre > 0,
                    ),
                )

        position.start = Point(position.end.line, position.end.column)

    if len(source) > 0:
        # Text left over after the last tag.
        elem = self.__parse_text(source, position)
        if (
            current is not None
            and isinstance(current, Parent)
            and current.children is not None
            and elem is not None
        ):
            current.append(elem)

    if len(self.tag_stack) > 0:
        raise Exception(
            f"The following tags where expected to be closed: {', '.join(repr(tag) for tag in self.tag_stack)}",
        )
    return current
Parse a given html or phml string into its corresponding phml ast.
Args
- source (str): The html or phml source to parse.
Returns
AST: A phml AST representing the parsed code source.