Skip to content

Commit

Permalink
[dfns] Add HTML prose definition when possible (#1444)
Browse files Browse the repository at this point in the history
Implements the logic discussed in https://github.com/w3c/respec/issues/4522

For each term defined in the specification being processed, the code now looks
for some element flagged with a `data-defines="#term-id"` attribute. If such
an element exists, a `htmlProse` property gets added to the definition in the
`dfns` extract with the HTML contents of that element.

The code applies some clean up to the HTML markup it attaches to the `htmlProse`
property:
- All asides that authoring tools may add here and there get dropped
- Any element that is not a simple block or inline content element gets dropped
- All attributes are dropped, except dir, href, lang, title

The clean up logic may need refinement over time once we gain experience with
actual definitions. Open questions include:

- Should we be stricter, e.g., only allowing `<p>`, `<br>`, and very common
inline elements?
- Should we keep `class` attributes for `<pre>` elements to help with syntax
highlighting?
- Should we keep tables? Images?

There is no good mechanism in Reffy to report potential issues encountered
during extraction for the time being. In the meantime, warnings get logged when
the code bumps into elements that seem surprising in the context of a term
definition.
  • Loading branch information
tidoust authored Dec 20, 2023
1 parent 923754a commit 2ed2ac5
Show file tree
Hide file tree
Showing 3 changed files with 173 additions and 1 deletion.
4 changes: 4 additions & 0 deletions schemas/browserlib/extract-dfns.json
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@
},
"definedIn": {
"type": "string"
},
"htmlProse": {
"type": "string",
"minLength": 1
}
}
}
Expand Down
64 changes: 63 additions & 1 deletion src/browserlib/extract-dfns.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,57 @@ function isNotAlreadyExported(dfn, idx, list) {
return first === dfn;
}

/**
 * Extract the element's inner HTML content, removing any complex structure,
 * so that the result can be injected elsewhere without creating problems.
 *
 * The element received as parameter is left untouched: all clean up steps are
 * applied to a deep copy of it. The clean up:
 * - drops asides that authoring tools add here and there;
 * - drops any element that is not a simple grouping or text-level element
 *   (a warning is logged for each of them);
 * - drops all attributes except "dir", "href", "lang" and "title", and turns
 *   "href" values into absolute URLs.
 *
 * @param {Element} proseEl Element that contains the prose definition. It is
 *   expected to still be attached to the document so that an enclosing
 *   `data-reffy-page` attribute can be found.
 * @returns {string} The trimmed inner HTML of the cleaned up copy. May be an
 *   empty string if nothing remains after clean up.
 */
function getHtmlProseDefinition(proseEl) {
  // Look for the URL of the page that contains the definition *before*
  // cloning: the copy is detached from the document, so calling `closest` on
  // the copy (or on one of its descendants) cannot see ancestors of the
  // original element, and multi-page specs typically flag the page URL on an
  // enclosing wrapper rather than on the definition itself.
  const page = proseEl
    .closest('[data-reffy-page]')
    ?.getAttribute('data-reffy-page');

  // Apply modifications to a copy of the element
  const copy = proseEl.cloneNode(true);

  // Drop asides that authoring tools add here and there
  let el;
  const asideSelector = [
    'aside', '.mdn-anno', '.wpt-tests-block', '.annotation',
    '[id^=dfn-panel-]'
  ].join(',');
  while ((el = copy.querySelector(asideSelector))) {
    el.remove();
  }

  // Keep simple grouping content and text-level semantics elements
  const keepSelector = [
    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li',
    'ol', 'p', 'pre', 'ul',
    'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn', 'em',
    'i', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'small', 'span',
    'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr'
  ].join(',');
  // Elements are removed one at a time (instead of through a single
  // `querySelectorAll`) so that descendants of an element that gets removed
  // do not trigger additional warnings.
  while ((el = copy.querySelector(`:not(${keepSelector})`))) {
    // The content is more complex than anticipated. It may be worth checking
    // the definition to assess whether the extraction logic needs to become
    // smarter. For lack of a better reporting mechanism for now, let's record
    // a warning.
    console.warn('[reffy]', `Unexpected element "${el.nodeName}" found in textual definition of "${copy.getAttribute('data-defines')}"`);
    el.remove();
  }

  // Drop all attributes except "href", "dir", "lang" and "title"
  // For "href", let's make sure that we have an absolute URL
  for (const child of copy.querySelectorAll('*')) {
    for (const attr of child.getAttributeNames()) {
      if (attr === 'href') {
        // The document's URL is only used (and only evaluated) when the
        // definition is not part of a page flagged with `data-reffy-page`.
        const url = new URL(child.getAttribute('href'), page ?? window.location.href);
        child.setAttribute('href', url.toString());
      }
      else if (!['dir', 'lang', 'title'].includes(attr)) {
        child.removeAttribute(attr);
      }
    }
  }

  return copy.innerHTML.trim();
}

function definitionMapper(el, idToHeading, usesDfnDataModel) {
let definedIn = 'prose';
const enclosingEl = el.closest('dt,pre,table,h1,h2,h3,h4,h5,h6,.note,.example') || el;
Expand Down Expand Up @@ -157,7 +208,7 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
url.hash = '#' + encodeURIComponent(el.getAttribute('id'));
const href = url.toString();

return {
const dfn = {
// ID is the id attribute
// (ID may not be unique in a multi-page spec)
id: el.getAttribute('id'),
Expand Down Expand Up @@ -211,6 +262,17 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
// indicates that definition appears in the main body of the specification)
definedIn
};

// Extract a prose definition in HTML for the term, if available
const proseEl = document.querySelector(`[data-defines="#${dfn.id}"]`);
if (proseEl) {
const htmlProse = getHtmlProseDefinition(proseEl);
if (htmlProse) {
dfn.htmlProse = htmlProse;
}
}

return dfn;
}

export default function (spec, idToHeading = {}) {
Expand Down
106 changes: 106 additions & 0 deletions tests/extract-dfns.js
Original file line number Diff line number Diff line change
Expand Up @@ -603,6 +603,112 @@ When initialize(<var>newItem</var>) is called, the following steps are run:</p>`
}],
spec: "CSS2"
},

{
title: "extracts the prose that defines a term",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a bar.
</p>`,
changesToBaseDfn: [{
htmlProse: "<dfn>Foo</dfn> enters a bar."
}]
},

{
title: "keeps basic structure for the prose that defines a term",
html: `<div data-defines='#foo'>
<p><dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i>enters</i> a <b>bar</b>.
<br>The bar has <strong>2 baz</strong> on tap:</p>
<ul>
<li>Baz<sub>1</sub></li>
<li>Baz<sup>2</sup></li>
</ul>
<pre>Foo bar baz</pre>
</div>`,
changesToBaseDfn: [{
htmlProse: `<p><dfn>Foo</dfn> <i>enters</i> a <b>bar</b>.
<br>The bar has <strong>2 baz</strong> on tap:</p>
<ul>
<li>Baz<sub>1</sub></li>
<li>Baz<sup>2</sup></li>
</ul>
<pre>Foo bar baz</pre>`
}]
},

{
title: "keeps useful attributes in prose that defines a term",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i dir="ltr">enters</i> a <a lang="en" title="Ze ol' tavern">bar</a>.
</p>`,
changesToBaseDfn: [{
htmlProse: `<dfn>Foo</dfn> <i dir="ltr">enters</i> a <a lang="en" title="Ze ol' tavern">bar</a>.`
}]
},

{
title: "keeps href in prose that defines a term",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a <a href="#bar">bar</a>.
</p>`,
changesToBaseDfn: [{
htmlProse: `<dfn>Foo</dfn> enters a <a href="about:blank#bar">bar</a>.`
}]
},

{
title: "keeps href in prose that defines a term in multi-page specs too",
html: `<p data-defines='#foo' data-reffy-page="https://www.w3.org/TR/foo/page1.html">
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a <a href="page2.html#bar">bar</a>.
</p>`,
changesToBaseDfn: [{
href: "https://www.w3.org/TR/foo/page1.html#foo",
htmlProse: `<dfn>Foo</dfn> enters a <a href="https://www.w3.org/TR/foo/page2.html#bar">bar</a>.`,
heading: {
href: 'https://www.w3.org/TR/foo/page1.html',
title: ''
}
}]
},

{
title: "extracts prose that defines a term without extra attributes",
html: `<p data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i class="verb">enters</i> a <a hidden inert tabindex=2>bar</a>.
</p>`,
changesToBaseDfn: [{
htmlProse: "<dfn>Foo</dfn> <i>enters</i> a <a>bar</a>."
}]
},

{
title: "suppresses asides from the prose that defines a term",
html: `<div data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> enters a bar.
<aside><p>I'm an aside</p></aside>
<p class='mdn-anno'>So am I</p>
<span class='wpt-tests-block'>Lots of tests</span>
<span class='annotation'>And annotations</span>
<div id='dfn-panel-foo'>A list of references</div>
</div>`,
changesToBaseDfn: [{
htmlProse: "<dfn>Foo</dfn> enters a bar."
}]
},

{
title: "suppresses more complex structure from the prose that defines a term",
html: `<div data-defines='#foo'>
<dfn id='foo' data-dfn-type='dfn'>Foo</dfn> <i class="verb">enters</i> a <a autofocus>bar</a>.
<section>
<h4>An inner section</h4>
</section>
<img src="bar.png" alt="A bar">
</div>`,
changesToBaseDfn: [{
htmlProse: "<dfn>Foo</dfn> <i>enters</i> a <a>bar</a>."
}]
}
];

describe("Test definition extraction", function () {
Expand Down

0 comments on commit 2ed2ac5

Please sign in to comment.