/**
 * Extract the inner HTML of the element that defines a term in prose,
 * removing any complex structure so that the result can be injected
 * elsewhere without creating problems.
 *
 * The given element is left untouched: all modifications are applied to a
 * copy.
 *
 * @param {Element} proseEl - Element that contains the prose definition
 *   (the caller selects it through its `data-defines` attribute).
 * @returns {string} Trimmed inner HTML of the simplified copy. May be an
 *   empty string when nothing is left after simplification.
 */
function getHtmlProseDefinition(proseEl) {
  // Look for the page URL on the original element, before copying it: the
  // copy is detached from the document, so it has no ancestors and
  // "closest" could never reach an enclosing "data-reffy-page" element.
  const page = proseEl
    .closest('[data-reffy-page]')
    ?.getAttribute('data-reffy-page');

  // Apply modifications to a copy of the element
  const copyEl = proseEl.cloneNode(true);

  // Drop asides that authoring tools add here and there
  const asideSelector = [
    'aside', '.mdn-anno', '.wpt-tests-block', '.annotation',
    '[id^=dfn-panel-]'
  ].join(',');
  let el;
  while ((el = copyEl.querySelector(asideSelector))) {
    el.remove();
  }

  // Keep simple grouping content and text-level semantics elements
  const keepSelector = [
    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li',
    'ol', 'p', 'pre', 'ul',
    'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn', 'em',
    'i', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'small', 'span',
    'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr'
  ].join(',');
  while ((el = copyEl.querySelector(`:not(${keepSelector})`))) {
    // The content is more complex than anticipated. It may be worth checking
    // the definition to assess whether the extraction logic needs to become
    // smarter. For lack of a better reporting mechanism for now, let's record
    // a warning.
    console.warn('[reffy]', `Unexpected element "${el.nodeName}" found in textual definition of "${copyEl.getAttribute('data-defines')}"`);
    el.remove();
  }

  // Drop all attributes except "href", "dir", "lang" and "title"
  // For "href", let's make sure that we have an absolute URL
  for (const child of copyEl.querySelectorAll('*')) {
    for (const attr of child.getAttributeNames()) {
      if (attr === 'href') {
        const href = child.getAttribute('href');
        try {
          const url = new URL(href, page ?? window.location.href);
          child.setAttribute('href', url.toString());
        }
        catch (err) {
          // An "href" that cannot be parsed cannot be turned into an absolute
          // URL. Drop it instead of aborting the whole extraction.
          console.warn('[reffy]', `Invalid href "${href}" found in textual definition of "${copyEl.getAttribute('data-defines')}"`);
          child.removeAttribute('href');
        }
      }
      else if (!['dir', 'lang', 'title'].includes(attr)) {
        child.removeAttribute(attr);
      }
    }
  }

  return copyEl.innerHTML.trim();
}
` }], spec: "CSS2" }, + + { + title: "extracts the prose that defines a term", + html: `+ Foo enters a bar. +
`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + }, + + { + title: "keeps basic structure for the prose that defines a term", + html: `Foo enters a bar.
+
The bar has 2 baz on tap:
Foo bar baz+
Foo enters a bar.
+
The bar has 2 baz on tap:
Foo bar baz` + }] + }, + + { + title: "keeps useful attributes in prose that defines a term", + html: `
+ Foo enters a bar. +
`, + changesToBaseDfn: [{ + htmlProse: `Foo enters a bar.` + }] + }, + + { + title: "keeps href in prose that defines a term", + html: `+ Foo enters a bar. +
`, + changesToBaseDfn: [{ + htmlProse: `Foo enters a bar.` + }] + }, + + { + title: "keeps href in prose that defines a term in multi-page specs too", + html: `+ Foo enters a bar. +
`, + changesToBaseDfn: [{ + href: "https://www.w3.org/TR/foo/page1.html#foo", + htmlProse: `Foo enters a bar.`, + heading: { + href: 'https://www.w3.org/TR/foo/page1.html', + title: '' + } + }] + }, + + { + title: "extracts prose that defines a term without extra attributes", + html: `+ Foo enters a bar. +
`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + }, + + { + title: "suppresses asides from the prose that defines a term", + html: `So am I
+ Lots of tests + And annotations +