Phase 2 of parser update: inlines (#3001)

See <#3001>
speced · Jan 9, 2025 · 8e3b2f4 · 8e3b2f4
1 parent ab9fa96
commit 8e3b2f4
Show file tree

Hide file tree

Showing 501 changed files with 14,299 additions and 6,874 deletions.
diff --git a/bikeshed/Spec.py b/bikeshed/Spec.py
@@ -185,7 +185,11 @@ def initMetadata(self, inputContent: InputSource.InputContent) -> None:
         m.retroactivelyCheckErrorLevel()
 
     def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
-        text = h.strFromNodes(h.initialDocumentParse(inputContent.content, h.ParseConfig.fromSpec(self)), withIlcc=True)
+        text = FIXMEreplaceMarkdownBlockquotes(inputContent.content)
+        nodes = h.initialDocumentParse(text, h.ParseConfig.fromSpec(self))
+        if self.debug:
+            h.debugNodes(nodes)
+        text = h.strFromNodes(nodes, withIlcc=True)
         inputContent.rawLines = [x + "\n" for x in text.split("\n")]
         return inputContent.lines
 
@@ -587,3 +591,35 @@ def checkForMixedIndents(lines: t.Sequence[l.Line], info: metadata.IndentInfo) -
                 m.lint(f"Your document appears to use tabs to indent, but line {line.i} starts with spaces.")
         if re.match(r"(\t+ +\t)|( +\t)", line.text):
             m.lint(f"Line {line.i}'s indent contains tabs after spaces.")
+
+
+def FIXMEreplaceMarkdownBlockquotes(text: str) -> str:
+    # Temporary hack to make the early HTML pass not be broken
+    # by Markdown blockquotes.
+    # * finds runs of two or more consecutive lines that each start with optional ws + ">"
+    # * replaces the first line's prefix (ws, ">", one optional ws char) with a blockquote-open PUA
+    # * strips that same prefix from each subsequent line in the run
+    # * adds a blockquote-close PUA to the end of the last line
+    # * the HTML parser recognizes those PUAs and emits the correct start/end tags.
+
+    lines = text.split("\n")
+    i = 0
+    while True:
+        if i >= len(lines):
+            break
+        match = re.match(r"\s*>\s?", lines[i])
+        if not match:
+            i += 1
+            continue
+        if i + 1 < len(lines) and re.match(r"\s*>\s?", lines[i + 1]):
+            lines[i] = constants.bqStart + lines[i][len(match[0]) :]
+            i += 1
+            while i < len(lines) and re.match(r"\s*>\s?", lines[i]):
+                match = re.match(r"\s*>\s?", lines[i])
+                assert match is not None
+                lines[i] = lines[i][len(match[0]) :]
+                i += 1
+            lines[i - 1] += constants.bqEnd
+        else:
+            i += 1
+    return "\n".join(lines)
diff --git a/bikeshed/config/__init__.py b/bikeshed/config/__init__.py
@@ -32,6 +32,7 @@
     intersperse,
     processTextNodes,
     reSubObject,
+    safeIndex,
     scriptPath,
     simplifyText,
     splitForValues,

diff --git a/bikeshed/config/dfnTypes.py b/bikeshed/config/dfnTypes.py
@@ -53,7 +53,7 @@
 dfnTypes = frozenset(list(dfnClassToType.values()) + ["dfn"])
 maybeTypes = frozenset(["value", "type", "at-rule", "function", "selector"])
 cssTypes = frozenset(["property", "value", "at-rule", "descriptor", "type", "function", "selector"])
-markupTypes = frozenset(["element", "element-attr", "element-state", "attr-value"])
+markupTypes = frozenset(["element", "element-attr", "element-state", "attr-value", "element-sub"])
 idlTypes = frozenset(
     [
         "event",

diff --git a/bikeshed/config/main.py b/bikeshed/config/main.py
@@ -192,3 +192,14 @@ def doEvery(s: float, action: t.Callable, lastTime: float | None = None) -> floa
         action()
         return newTime
     return lastTime
+
+
+if t.TYPE_CHECKING:
+    SafeIndexDefaultT = t.TypeVar("SafeIndexDefaultT")
+
+
+def safeIndex(coll: t.Sequence, needle: t.Any, default: SafeIndexDefaultT = None) -> int | SafeIndexDefaultT:  # type: ignore[assignment]
+    try:
+        return coll.index(needle)
+    except ValueError:
+        return default
diff --git a/bikeshed/constants.py b/bikeshed/constants.py
@@ -13,3 +13,5 @@
 incrementLineCountChar = "\uebbd"
 decrementLineCountChar = "\uebbf"
 bsComment = "<!--\uebbe-->"
+bqStart = "\uebc0"
+bqEnd = "\uebc1"
diff --git a/bikeshed/datablocks.py b/bikeshed/datablocks.py
@@ -251,7 +251,7 @@ def transformPre(lines: list[str], tagName: str, firstLine: str, lineNum: int |
 
 
 def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
-    rows = parseDefBlock(lines, "simpledef", capitalizeKeys=False, doc=doc)
+    rows = parseDefBlock(lines, "simpledef", capitalizeKeys=False, doc=doc, lineNum=lineNum)
     lineNumAttr = ""
     if lineNum is not None:
         lineNumAttr = f" bs-line-number={lineNum}"
@@ -268,7 +268,7 @@ def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum:
 
 def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
     attrs: OrderedDict[str, str | None] = OrderedDict()
-    parsedAttrs = parseDefBlock(lines, "propdef", doc=doc)
+    parsedAttrs = parseDefBlock(lines, "propdef", doc=doc, lineNum=lineNum)
     # Displays entries in the order specified in attrs,
     # then if there are any unknown parsedAttrs values,
     # they're displayed afterward in the order they were specified.
@@ -326,7 +326,10 @@ def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
             val = parsedAttrs[key]
         elif val is None:
             # Required key, not provided
-            m.die(f"The propdef for '{parsedAttrs.get('Name', '???')}' is missing a '{key}' line.", lineNum=lineNum)
+            if "Name" in parsedAttrs:
+                m.die(f"The propdef for '{parsedAttrs.get('Name', '')}' is missing a '{key}' line.", lineNum=lineNum)
+            else:
+                m.die(f"The propdef block is missing a '{key}' line.", lineNum=lineNum)
             continue
         else:
             # Optional key, just use default
@@ -380,7 +383,7 @@ def transformDescdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
     lineNumAttr = ""
     if lineNum is not None:
         lineNumAttr = f" bs-line-number={lineNum}"
-    vals = parseDefBlock(lines, "descdef", doc=doc)
+    vals = parseDefBlock(lines, "descdef", doc=doc, lineNum=lineNum)
     if "partial" in firstLine or "New values" in vals:
         requiredKeys = ["Name", "For"]
         ret = [
@@ -432,7 +435,7 @@ def transformElementdef(lines: list[str], tagName: str, firstLine: str, lineNum:
     if lineNum is not None:
         lineNumAttr = f" bs-line-number={lineNum}"
     attrs: OrderedDict[str, str | None] = OrderedDict()
-    parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc)
+    parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc, lineNum=lineNum)
     if "Attribute groups" in parsedAttrs or "Attributes" in parsedAttrs:
         html = "<ul>"
         if "Attribute groups" in parsedAttrs:
@@ -587,15 +590,17 @@ def parseDefBlock(
     capitalizeKeys: bool = True,
     lineNum: int | None = None,
 ) -> OrderedDict[str, str]:
-    vals: OrderedDict[str, str] = OrderedDict()
+    vals: OrderedDict[str, tuple[int, str]] = OrderedDict()
+    if lineNum is None:
+        lineNum = 0
     lastKey = None
-    for line in lines:
+    for li, line in enumerate(lines, lineNum + 1):
         if "<!--" in line:
             commentMatch = re.match(r"(.*)<!--.*?-->(.*)", line)
             if not commentMatch:
                 m.die(
                     f"Detected the start of a comment on a line, but couldn't find the end. Please remove the comment, or keep it on a single line:\n{line}",
-                    lineNum=lineNum,
+                    lineNum=li,
                 )
                 continue
             # Just pull the comment out, and continue
@@ -609,7 +614,7 @@ def parseDefBlock(
                 key = lastKey
                 val = line.strip()
             else:
-                m.die(f"Incorrectly formatted {type} line for '{vals.get('Name', '???')}':\n{line}", lineNum=lineNum)
+                m.die(f"Incorrectly formatted {type} line for '{vals.get('Name', '???')}':\n{line}", lineNum=li)
                 continue
         else:
             key = match.group(1).strip()
@@ -618,12 +623,27 @@ def parseDefBlock(
             lastKey = key
             val = (match.group(2) or "").strip()
         if key in vals:
-            vals[key] += "\n" + val
+            vals[key] = (vals[key][0], vals[key][1] + "\n" + val)
         else:
-            vals[key] = val
-    for key, val in vals.items():
-        vals[key] = h.parseText(val, h.ParseConfig.fromSpec(doc))
-    return vals
+            vals[key] = (lineNum, val)
+    retVals: OrderedDict[str, str] = OrderedDict()
+    for key, (li, val) in vals.items():
+        context = f"'{key}' key in (line {lineNum}) {type}"
+        pConfig = h.ParseConfig.fromSpec(doc, context=context)
+        if type in ("propdef", "descdef") and key == "Name":
+            newVal = ""
+            for node in h.nodesFromHtml(val, pConfig, startLine=1):
+                if isinstance(node, h.parser.Text):
+                    newVal += str(node)
+                else:
+                    m.die(
+                        f"'Name' key should contain just the property/descriptor name, or a comma-separated list. Found markup:\n  {val}",
+                        lineNum=li,
+                    )
+            retVals[key] = newVal
+        else:
+            retVals[key] = h.parseText(val, pConfig, startLine=1)
+    return retVals
 
 
 def transformRailroad(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:

diff --git a/bikeshed/h/__init__.py b/bikeshed/h/__init__.py
@@ -1,3 +1,4 @@
+from . import parser
 from .dom import (
     DuplicatedLinkText,
     E,
@@ -81,6 +82,7 @@
     ParseConfig,
     debugNodes,
     initialDocumentParse,
+    nodesFromHtml,
     parseLines,
     parseText,
     parseTitle,

diff --git a/bikeshed/h/dom.py b/bikeshed/h/dom.py
@@ -225,7 +225,12 @@ def serializeTag(el: t.ElementT) -> str:
     # Use when you want to output the HTML,
     # but it might be a container with a lot of content.
     tag = "<" + el.tag
-    for n, v in el.attrib.items():
+    for n, v in sorted(el.attrib.items()):
+        # Don't output the bs-* attributes, they're added by BS
+        # and don't show up in the source, so it's confusing to
+        # print them.
+        if t.cast(str, n).startswith("bs-"):
+            continue
         tag += ' {n}="{v}"'.format(n=str(n), v=escapeAttr(str(v)))
     tag += ">"
     return tag
@@ -392,6 +397,10 @@ def insertAfter(target: t.ElementT, *els: t.NodesT) -> t.ElementT:
 
 
 def removeNode(node: t.ElementT) -> t.ElementT:
+    # Kills the node *and* its children.
+    # If you just want to remove the node itself and lift
+    # its contents up into its place, call
+    # replaceWithContents() instead.
     parent = node.getparent()
     if parent is None:
         return node
@@ -529,7 +538,12 @@ def ancestorElements(el: t.ElementT, self: bool = False) -> t.Generator[t.Elemen
     yield from el.iterancestors()
 
 
-def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool = True) -> list[t.NodeT]:
+def childNodes(
+    parentEl: t.ElementishT,
+    clear: bool = False,
+    skipOddNodes: bool = True,
+    mergeText: bool = False,
+) -> list[t.NodeT]:
     """
     This function returns all the nodes in a parent element in the DOM sense,
     mixing text nodes (strings) and other nodes together
@@ -543,7 +557,7 @@ def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool
     In other words, the following is a no-op:
 
     ```
-    appendChild(parentEl, *childNodes(parentEl, clear=True), allowEmpty=True)
+    appendChild(parentEl, *childNodes(parentEl, clear=True, skipOddNodes=False), allowEmpty=True)
     ```
 
     Using clear=True is required if you're going to be modifying the element or its children,
@@ -553,37 +567,49 @@ def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool
 
     skipOddNodes ensures that the return value will only be text and Element nodes;
     if it's false, there might be comments, PIs, etc.
+
+    mergeText merges adjacent text nodes in the output, even if they weren't adjacent
+    in the source document (such as if a skipped odd node separated them).
     """
     ret: list[t.NodeT] = []
 
+    def append(nodes: list[t.NodeT], node: t.NodeT) -> None:
+        if not mergeText:
+            nodes.append(node)
+            return
+        if isinstance(node, str) and len(nodes) > 0 and isinstance(nodes[-1], str):
+            nodes[-1] += node
+        else:
+            nodes.append(node)
+
     if isinstance(parentEl, list):
         for c in parentEl:
             if isinstance(c, str):
-                ret.append(c)
+                append(ret, c)
                 continue
             if skipOddNodes and isOddNode(c):
                 pass
             else:
-                ret.append(c)
+                append(ret, c)
             if not emptyText(c.tail, wsAllowed=False):
-                ret.append(t.cast(str, c.tail))
+                append(ret, t.cast(str, c.tail))
                 if clear:
                     c.tail = None
         if clear:
             parentEl[:] = []
         return ret
 
     if not emptyText(parentEl.text, wsAllowed=False):
-        ret.append(t.cast(str, parentEl.text))
+        append(ret, t.cast(str, parentEl.text))
         if clear:
             parentEl.text = None
     for c in childElements(parentEl, oddNodes=True):
         if skipOddNodes and isOddNode(c):
             pass
         else:
-            ret.append(c)
+            append(ret, c)
         if not emptyText(c.tail, wsAllowed=False):
-            ret.append(t.cast(str, c.tail))
+            append(ret, t.cast(str, c.tail))
             if clear:
                 c.tail = None
     if clear:
@@ -910,8 +936,6 @@ def addOldIDs(els: t.Iterable[t.ElementT]) -> None:
 
 
 def dedupIDs(doc: t.SpecT) -> None:
-    import itertools as iter
-
     ids: OrderedDict[str, list[t.ElementT]] = OrderedDict()
     for el in findAll("[id]", doc):
         ids.setdefault(t.cast(str, el.get("id")), []).append(el)
@@ -923,41 +947,44 @@ def dedupIDs(doc: t.SpecT) -> None:
         if re.match(r"issue-[0-9a-fA-F]{8}$", dupeId):
             # Don't warn about issues, it's okay if they have the same ID because they're identical text.
             warnAboutDupes = False
-        ints = iter.count(1)
+        if dupeId.startswith("ref-for-"):
+            warnAboutDupes = False
+        complaintEls = []
+        if warnAboutDupes:
+            for el in els:
+                if el.get("data-silently-dedup") is not None:
+                    continue
+                complaintEls.append(el)
+        # Now dedup everything left in the list after the first one.
+        dedupIndex = 1
         for el in els[1:]:
-            # If I registered an alternate ID, try to use that.
             if el.get("data-alternate-id"):
                 altId = el.get("data-alternate-id")
                 assert altId is not None
                 if altId not in ids:
                     el.set("id", safeID(doc, altId))
-                    ids.setdefault(altId, []).append(el)
+                    ids[altId] = [el]
+                    complaintEls.remove(el)
                     continue
-            if el.get("data-silently-dedup") is not None:
-                warnAboutDupes = False
-            if dupeId.startswith("ref-for-"):
-                warnAboutDupes = False
-            # Try to de-dup the id by appending an integer after it.
-            if warnAboutDupes:
-                warn(
-                    f"Multiple elements have the same ID '{dupeId}'.\nDeduping, but this ID may not be stable across revisions.",
-                    el=el,
-                )
-            for x in ints:
-                altId = "{}{}".format(dupeId, circledDigits(x))
-                if altId not in ids:
-                    el.set("id", safeID(doc, altId))
-                    ids.setdefault(altId, []).append(el)
-                    break
+            altId = f"{dupeId}{circledDigits(dedupIndex)}"
+            while altId in ids:
+                dedupIndex += 1
+                altId = f"{dupeId}{circledDigits(dedupIndex)}"
+            el.set("id", safeID(doc, altId))
+            ids[altId] = [el]
+
+        if len(complaintEls) > 1:
+            complaintDetails = [f"<{el.tag}> on line {approximateLineNumber(el)}" for el in complaintEls]
+            warn(
+                f"Multiple elements have the same id '{dupeId}':\n  {', '.join(complaintDetails)}\nDeduping, but this ID may not be stable across revisions.",
+            )
 
 
 def approximateLineNumber(el: t.ElementT, setIntermediate: bool = True) -> str | None:
     if el.get("bs-line-number"):
         return el.get("bs-line-number")
     parent = parentElement(el)
     if not isElement(parent):
-        if el.tag == "html":
-            return None
         return None
     approx = approximateLineNumber(parent, setIntermediate=setIntermediate)
     if approx is None:

diff --git a/bikeshed/h/parser/main.py b/bikeshed/h/parser/main.py
@@ -69,6 +69,7 @@ def linesFromNodes(nodes: t.Iterable[ParserNode]) -> list[str]:
 def debugNodes(nodes: t.Iterable[ParserNode]) -> list[ParserNode]:
     nodes = list(nodes)
     for node in nodes:
+        print("------")  # noqa: T201
         print(repr(node))  # noqa: T201
         print(repr(strFromNodes([node], withIlcc=True)))  # noqa: T201
     return nodes