Skip to content

Commit

Permalink
Phase 2 of parser update: inlines (#3001)
Browse files Browse the repository at this point in the history
See <#3001>
  • Loading branch information
tabatkins authored Jan 9, 2025
1 parent ab9fa96 commit 8e3b2f4
Show file tree
Hide file tree
Showing 501 changed files with 14,299 additions and 6,874 deletions.
38 changes: 37 additions & 1 deletion bikeshed/Spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,11 @@ def initMetadata(self, inputContent: InputSource.InputContent) -> None:
m.retroactivelyCheckErrorLevel()

def earlyParse(self, inputContent: InputSource.InputContent) -> list[l.Line]:
text = h.strFromNodes(h.initialDocumentParse(inputContent.content, h.ParseConfig.fromSpec(self)), withIlcc=True)
text = FIXMEreplaceMarkdownBlockquotes(inputContent.content)
nodes = h.initialDocumentParse(text, h.ParseConfig.fromSpec(self))
if self.debug:
h.debugNodes(nodes)
text = h.strFromNodes(nodes, withIlcc=True)
inputContent.rawLines = [x + "\n" for x in text.split("\n")]
return inputContent.lines

Expand Down Expand Up @@ -587,3 +591,35 @@ def checkForMixedIndents(lines: t.Sequence[l.Line], info: metadata.IndentInfo) -
m.lint(f"Your document appears to use tabs to indent, but line {line.i} starts with spaces.")
if re.match(r"(\t+ +\t)|( +\t)", line.text):
m.lint(f"Line {line.i}'s indent contains tabs after spaces.")


def FIXMEreplaceMarkdownBlockquotes(text: str) -> str:
    # Temporary hack to make the early HTML pass not be broken
    # by Markdown blockquotes.
    #
    # A "quoted" line is one that starts with optional whitespace, a `>`,
    # and at most one more whitespace character. For every run of two or
    # more consecutive quoted lines:
    # * the first line's prefix is swapped for the blockquote-open PUA
    # * each later line in the run just has its prefix stripped
    # * the blockquote-close PUA is appended to the run's last line
    # The HTML parser recognizes those PUAs and emits the correct start/end tags.
    #
    # A quoted line that is NOT directly followed by another quoted line
    # is left exactly as it was.

    quotePrefix = r"\s*>\s?"
    lines = text.split("\n")
    total = len(lines)
    cursor = 0
    while cursor < total:
        opener = re.match(quotePrefix, lines[cursor])
        # Only a quoted line with a quoted successor opens a blockquote run.
        if opener is None or cursor + 1 >= total or not re.match(quotePrefix, lines[cursor + 1]):
            cursor += 1
            continue

        # re.match() anchors at index 0, so .end() is the prefix length.
        lines[cursor] = constants.bqStart + lines[cursor][opener.end() :]
        cursor += 1

        # Consume the rest of the run, dropping each line's quote prefix.
        while cursor < total:
            follower = re.match(quotePrefix, lines[cursor])
            if follower is None:
                break
            lines[cursor] = lines[cursor][follower.end() :]
            cursor += 1

        # cursor now sits one past the run, so cursor - 1 is its last line.
        lines[cursor - 1] += constants.bqEnd
    return "\n".join(lines)
1 change: 1 addition & 0 deletions bikeshed/config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
intersperse,
processTextNodes,
reSubObject,
safeIndex,
scriptPath,
simplifyText,
splitForValues,
Expand Down
2 changes: 1 addition & 1 deletion bikeshed/config/dfnTypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
dfnTypes = frozenset(list(dfnClassToType.values()) + ["dfn"])
maybeTypes = frozenset(["value", "type", "at-rule", "function", "selector"])
cssTypes = frozenset(["property", "value", "at-rule", "descriptor", "type", "function", "selector"])
markupTypes = frozenset(["element", "element-attr", "element-state", "attr-value"])
markupTypes = frozenset(["element", "element-attr", "element-state", "attr-value", "element-sub"])
idlTypes = frozenset(
[
"event",
Expand Down
11 changes: 11 additions & 0 deletions bikeshed/config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,14 @@ def doEvery(s: float, action: t.Callable, lastTime: float | None = None) -> floa
action()
return newTime
return lastTime


if t.TYPE_CHECKING:
    # Declared only under t.TYPE_CHECKING; used just in safeIndex()'s annotations.
    SafeIndexDefaultT = t.TypeVar("SafeIndexDefaultT")


def safeIndex(coll: t.Sequence, needle: t.Any, default: SafeIndexDefaultT = None) -> int | SafeIndexDefaultT:  # type: ignore[assignment]
    """
    Non-throwing version of `coll.index(needle)`.

    Returns the index reported by `coll.index()`; if that call raises
    ValueError, returns `default` (None unless the caller supplies one).
    """
    try:
        position = coll.index(needle)
    except ValueError:
        return default
    return position
2 changes: 2 additions & 0 deletions bikeshed/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,5 @@
incrementLineCountChar = "\uebbd"
decrementLineCountChar = "\uebbf"
bsComment = "<!--\uebbe-->"
bqStart = "\uebc0"
bqEnd = "\uebc1"
48 changes: 34 additions & 14 deletions bikeshed/datablocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def transformPre(lines: list[str], tagName: str, firstLine: str, lineNum: int |


def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
rows = parseDefBlock(lines, "simpledef", capitalizeKeys=False, doc=doc)
rows = parseDefBlock(lines, "simpledef", capitalizeKeys=False, doc=doc, lineNum=lineNum)
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" bs-line-number={lineNum}"
Expand All @@ -268,7 +268,7 @@ def transformSimpleDef(lines: list[str], tagName: str, firstLine: str, lineNum:

def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "propdef", doc=doc)
parsedAttrs = parseDefBlock(lines, "propdef", doc=doc, lineNum=lineNum)
# Displays entries in the order specified in attrs,
# then if there are any unknown parsedAttrs values,
# they're displayed afterward in the order they were specified.
Expand Down Expand Up @@ -326,7 +326,10 @@ def transformPropdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
val = parsedAttrs[key]
elif val is None:
# Required key, not provided
m.die(f"The propdef for '{parsedAttrs.get('Name', '???')}' is missing a '{key}' line.", lineNum=lineNum)
if "Name" in parsedAttrs:
m.die(f"The propdef for '{parsedAttrs.get('Name', '')}' is missing a '{key}' line.", lineNum=lineNum)
else:
m.die(f"The propdef block is missing a '{key}' line.", lineNum=lineNum)
continue
else:
# Optional key, just use default
Expand Down Expand Up @@ -380,7 +383,7 @@ def transformDescdef(lines: list[str], tagName: str, firstLine: str, lineNum: in
lineNumAttr = ""
if lineNum is not None:
lineNumAttr = f" bs-line-number={lineNum}"
vals = parseDefBlock(lines, "descdef", doc=doc)
vals = parseDefBlock(lines, "descdef", doc=doc, lineNum=lineNum)
if "partial" in firstLine or "New values" in vals:
requiredKeys = ["Name", "For"]
ret = [
Expand Down Expand Up @@ -432,7 +435,7 @@ def transformElementdef(lines: list[str], tagName: str, firstLine: str, lineNum:
if lineNum is not None:
lineNumAttr = f" bs-line-number={lineNum}"
attrs: OrderedDict[str, str | None] = OrderedDict()
parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc)
parsedAttrs = parseDefBlock(lines, "elementdef", doc=doc, lineNum=lineNum)
if "Attribute groups" in parsedAttrs or "Attributes" in parsedAttrs:
html = "<ul>"
if "Attribute groups" in parsedAttrs:
Expand Down Expand Up @@ -587,15 +590,17 @@ def parseDefBlock(
capitalizeKeys: bool = True,
lineNum: int | None = None,
) -> OrderedDict[str, str]:
vals: OrderedDict[str, str] = OrderedDict()
vals: OrderedDict[str, tuple[int, str]] = OrderedDict()
if lineNum is None:
lineNum = 0
lastKey = None
for line in lines:
for li, line in enumerate(lines, lineNum + 1):
if "<!--" in line:
commentMatch = re.match(r"(.*)<!--.*?-->(.*)", line)
if not commentMatch:
m.die(
f"Detected the start of a comment on a line, but couldn't find the end. Please remove the comment, or keep it on a single line:\n{line}",
lineNum=lineNum,
lineNum=li,
)
continue
# Just pull the comment out, and continue
Expand All @@ -609,7 +614,7 @@ def parseDefBlock(
key = lastKey
val = line.strip()
else:
m.die(f"Incorrectly formatted {type} line for '{vals.get('Name', '???')}':\n{line}", lineNum=lineNum)
m.die(f"Incorrectly formatted {type} line for '{vals.get('Name', '???')}':\n{line}", lineNum=li)
continue
else:
key = match.group(1).strip()
Expand All @@ -618,12 +623,27 @@ def parseDefBlock(
lastKey = key
val = (match.group(2) or "").strip()
if key in vals:
vals[key] += "\n" + val
vals[key] = (vals[key][0], vals[key][1] + "\n" + val)
else:
vals[key] = val
for key, val in vals.items():
vals[key] = h.parseText(val, h.ParseConfig.fromSpec(doc))
return vals
vals[key] = (lineNum, val)
retVals: OrderedDict[str, str] = OrderedDict()
for key, (li, val) in vals.items():
context = f"'{key}' key in (line {lineNum}) {type}"
pConfig = h.ParseConfig.fromSpec(doc, context=context)
if type in ("propdef", "descdef") and key == "Name":
newVal = ""
for node in h.nodesFromHtml(val, pConfig, startLine=1):
if isinstance(node, h.parser.Text):
newVal += str(node)
else:
m.die(
f"'Name' key should contain just the property/descriptor name, or a comma-separated list. Found markup:\n {val}",
lineNum=li,
)
retVals[key] = newVal
else:
retVals[key] = h.parseText(val, pConfig, startLine=1)
return retVals


def transformRailroad(lines: list[str], tagName: str, firstLine: str, lineNum: int | None, doc: t.SpecT) -> list[str]:
Expand Down
2 changes: 2 additions & 0 deletions bikeshed/h/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from . import parser
from .dom import (
DuplicatedLinkText,
E,
Expand Down Expand Up @@ -81,6 +82,7 @@
ParseConfig,
debugNodes,
initialDocumentParse,
nodesFromHtml,
parseLines,
parseText,
parseTitle,
Expand Down
91 changes: 59 additions & 32 deletions bikeshed/h/dom.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,12 @@ def serializeTag(el: t.ElementT) -> str:
# Use when you want to output the HTML,
# but it might be a container with a lot of content.
tag = "<" + el.tag
for n, v in el.attrib.items():
for n, v in sorted(el.attrib.items()):
# Don't output the bs-* attributes, they're added by BS
# and don't show up in the source, so it's confusing to
# print them.
if t.cast(str, n).startswith("bs-"):
continue
tag += ' {n}="{v}"'.format(n=str(n), v=escapeAttr(str(v)))
tag += ">"
return tag
Expand Down Expand Up @@ -392,6 +397,10 @@ def insertAfter(target: t.ElementT, *els: t.NodesT) -> t.ElementT:


def removeNode(node: t.ElementT) -> t.ElementT:
# Kills the node *and* its children.
# If you just want to remove the node itself, and lift
# the contents up into its place, you wanna call
# replaceWithContents()
parent = node.getparent()
if parent is None:
return node
Expand Down Expand Up @@ -529,7 +538,12 @@ def ancestorElements(el: t.ElementT, self: bool = False) -> t.Generator[t.Elemen
yield from el.iterancestors()


def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool = True) -> list[t.NodeT]:
def childNodes(
parentEl: t.ElementishT,
clear: bool = False,
skipOddNodes: bool = True,
mergeText: bool = False,
) -> list[t.NodeT]:
"""
This function returns all the nodes in a parent element in the DOM sense,
mixing text nodes (strings) and other nodes together
Expand All @@ -543,7 +557,7 @@ def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool
In other words, the following is a no-op:
```
appendChild(parentEl, *childNodes(parentEl, clear=True), allowEmpty=True)
appendChild(parentEl, *childNodes(parentEl, clear=True, skipOddNodes=False), allowEmpty=True)
```
Using clear=True is required if you're going to be modifying the element or its children,
Expand All @@ -553,37 +567,49 @@ def childNodes(parentEl: t.ElementishT, clear: bool = False, skipOddNodes: bool
skipOddNodes ensures that the return value will only be text and Element nodes;
if it's false, there might be comments, PIs, etc.
mergeText merges adjacent text nodes in the output, even if they weren't adjacent
in the source document (such as if a skipped odd node separated them).
"""
ret: list[t.NodeT] = []

def append(nodes: list[t.NodeT], node: t.NodeT) -> None:
if not mergeText:
nodes.append(node)
return
if isinstance(node, str) and len(nodes) > 0 and isinstance(nodes[-1], str):
nodes[-1] += node
else:
nodes.append(node)

if isinstance(parentEl, list):
for c in parentEl:
if isinstance(c, str):
ret.append(c)
append(ret, c)
continue
if skipOddNodes and isOddNode(c):
pass
else:
ret.append(c)
append(ret, c)
if not emptyText(c.tail, wsAllowed=False):
ret.append(t.cast(str, c.tail))
append(ret, t.cast(str, c.tail))
if clear:
c.tail = None
if clear:
parentEl[:] = []
return ret

if not emptyText(parentEl.text, wsAllowed=False):
ret.append(t.cast(str, parentEl.text))
append(ret, t.cast(str, parentEl.text))
if clear:
parentEl.text = None
for c in childElements(parentEl, oddNodes=True):
if skipOddNodes and isOddNode(c):
pass
else:
ret.append(c)
append(ret, c)
if not emptyText(c.tail, wsAllowed=False):
ret.append(t.cast(str, c.tail))
append(ret, t.cast(str, c.tail))
if clear:
c.tail = None
if clear:
Expand Down Expand Up @@ -910,8 +936,6 @@ def addOldIDs(els: t.Iterable[t.ElementT]) -> None:


def dedupIDs(doc: t.SpecT) -> None:
import itertools as iter

ids: OrderedDict[str, list[t.ElementT]] = OrderedDict()
for el in findAll("[id]", doc):
ids.setdefault(t.cast(str, el.get("id")), []).append(el)
Expand All @@ -923,41 +947,44 @@ def dedupIDs(doc: t.SpecT) -> None:
if re.match(r"issue-[0-9a-fA-F]{8}$", dupeId):
# Don't warn about issues, it's okay if they have the same ID because they're identical text.
warnAboutDupes = False
ints = iter.count(1)
if dupeId.startswith("ref-for-"):
warnAboutDupes = False
complaintEls = []
if warnAboutDupes:
for el in els:
if el.get("data-silently-dedup") is not None:
continue
complaintEls.append(el)
# Now dedup everything left in the list after the first one.
dedupIndex = 1
for el in els[1:]:
# If I registered an alternate ID, try to use that.
if el.get("data-alternate-id"):
altId = el.get("data-alternate-id")
assert altId is not None
if altId not in ids:
el.set("id", safeID(doc, altId))
ids.setdefault(altId, []).append(el)
ids[altId] = [el]
complaintEls.remove(el)
continue
if el.get("data-silently-dedup") is not None:
warnAboutDupes = False
if dupeId.startswith("ref-for-"):
warnAboutDupes = False
# Try to de-dup the id by appending an integer after it.
if warnAboutDupes:
warn(
f"Multiple elements have the same ID '{dupeId}'.\nDeduping, but this ID may not be stable across revisions.",
el=el,
)
for x in ints:
altId = "{}{}".format(dupeId, circledDigits(x))
if altId not in ids:
el.set("id", safeID(doc, altId))
ids.setdefault(altId, []).append(el)
break
altId = f"{dupeId}{circledDigits(dedupIndex)}"
while altId in ids:
dedupIndex += 1
altId = f"{dupeId}{circledDigits(dedupIndex)}"
el.set("id", safeID(doc, altId))
ids[altId] = [el]

if len(complaintEls) > 1:
complaintDetails = [f"<{el.tag}> on line {approximateLineNumber(el)}" for el in complaintEls]
warn(
f"Multiple elements have the same id '{dupeId}':\n {', '.join(complaintDetails)}\nDeduping, but this ID may not be stable across revisions.",
)


def approximateLineNumber(el: t.ElementT, setIntermediate: bool = True) -> str | None:
if el.get("bs-line-number"):
return el.get("bs-line-number")
parent = parentElement(el)
if not isElement(parent):
if el.tag == "html":
return None
return None
approx = approximateLineNumber(parent, setIntermediate=setIntermediate)
if approx is None:
Expand Down
1 change: 1 addition & 0 deletions bikeshed/h/parser/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def linesFromNodes(nodes: t.Iterable[ParserNode]) -> list[str]:
def debugNodes(nodes: t.Iterable[ParserNode]) -> list[ParserNode]:
    """
    Debugging aid: print every node, then hand the nodes back as a list.

    The iterable is materialized first, so a one-shot iterator is not
    lost to the caller. For each node this prints a "------" separator,
    the node's repr, and the repr of `strFromNodes([node], withIlcc=True)`.
    """
    materialized = list(nodes)
    for current in materialized:
        print("------")  # noqa: T201
        print(f"{current!r}")  # noqa: T201
        dumped = strFromNodes([current], withIlcc=True)
        print(f"{dumped!r}")  # noqa: T201
    return materialized
Expand Down
Loading

0 comments on commit 8e3b2f4

Please sign in to comment.