From 2ed2ac546c4c07f98c606ab0415a1b45bca19036 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Daoust?=
Date: Wed, 20 Dec 2023 09:52:25 +0100
Subject: [PATCH] [dfns] Add HTML prose definition when possible (#1444)

Implements the logic discussed in https://github.com/w3c/respec/issues/4522

For each term defined in the specification being processed, the code now looks
for some element flagged with a `data-defines="#term-id"` attribute. If such an
element exists, an `htmlProse` property gets added to the definition in the
`dfns` extract with the HTML contents of that element.

The code applies some clean up to the HTML markup it attaches to the
`htmlProse` property:
- All asides that authoring tools may add here and there get dropped
- Any element that is not a simple block or inline content element gets dropped
- All attributes are dropped, except dir, href, lang, title

The clean up logic may need refinement over time once we gain experience with
actual definitions. Open questions include:
- Should we be stricter, e.g., only allowing `<p>`, `<br>`, and very common
inline elements?
- Should we keep `class` attributes for `<pre>` elements to help with syntax
highlighting?
- Should we keep tables? Images?

There is no good mechanism in Reffy to report potential issues encountered
during extraction for the time being. In the meantime, warnings get logged when
the code bumps into elements that seem surprising in the context of a term
definition.
---
 schemas/browserlib/extract-dfns.json |   4 +
 src/browserlib/extract-dfns.mjs      |  64 +++++++++++++++-
 tests/extract-dfns.js                | 106 +++++++++++++++++++++++++++
 3 files changed, 173 insertions(+), 1 deletion(-)

diff --git a/schemas/browserlib/extract-dfns.json b/schemas/browserlib/extract-dfns.json
index 807e00f9..47b0deba 100644
--- a/schemas/browserlib/extract-dfns.json
+++ b/schemas/browserlib/extract-dfns.json
@@ -63,6 +63,10 @@
       },
       "definedIn": {
         "type": "string"
+      },
+      "htmlProse": {
+        "type": "string",
+        "minLength": 1
       }
     }
   }
diff --git a/src/browserlib/extract-dfns.mjs b/src/browserlib/extract-dfns.mjs
index 93ef9bb5..3a53b883 100644
--- a/src/browserlib/extract-dfns.mjs
+++ b/src/browserlib/extract-dfns.mjs
@@ -122,6 +122,57 @@ function isNotAlreadyExported(dfn, idx, list) {
   return first === dfn;
 }
 
+// Extract the element's inner HTML content, removing any complex structure,
+// so that the result can be injected elsewhere without creating problems.
+function getHtmlProseDefinition(proseEl) {
+  // Apply modifications to a copy of the element
+  proseEl = proseEl.cloneNode(true);
+
+  // Drop asides that authoring tools add here and there
+  let el;
+  const asideSelector = [
+    'aside', '.mdn-anno', '.wpt-tests-block', '.annotation',
+    '[id^=dfn-panel-]'
+  ].join(',');
+  while (el = proseEl.querySelector(asideSelector)) {
+    el.remove();
+  }
+
+  // Keep simple grouping content and text-level semantics elements
+  const keepSelector = [
+    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li',
+    'ol', 'p', 'pre', 'ul',
+    'a', 'abbr', 'b', 'bdi', 'bdo', 'br', 'cite', 'code', 'data', 'dfn', 'em',
+    'i', 'kbd', 'mark', 'q', 'rp', 'rt', 'ruby', 's', 'samp', 'small', 'span',
+    'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr'
+  ].join(',');
+  while (el = proseEl.querySelector(`:not(${keepSelector})`)) {
+    // The content is more complex than anticipated. It may be worth checking
+    // the definition to assess whether the extraction logic needs to become
+    // smarter. For lack of a better reporting mechanism for now, let's record
+    // a warning.
+    console.warn('[reffy]', `Unexpected element "${el.nodeName}" found in textual definition of "${proseEl.getAttribute('data-defines')}"`);
+    el.remove();
+  }
+
+  // Drop all attributes except "href", "dir", "lang" and "title"
+  // For "href", let's make sure that we have an absolute URL
+  [...proseEl.querySelectorAll('*')].forEach(el => {
+    el.getAttributeNames().forEach(attr => {
+      if (attr === 'href') {
+        const page = el.closest('[data-reffy-page]')?.getAttribute('data-reffy-page');
+        const url = new URL(el.getAttribute('href'), page ?? window.location.href);
+        el.setAttribute('href', url.toString());
+      }
+      else if (!['dir', 'lang', 'title'].includes(attr)) {
+        el.removeAttribute(attr);
+      }
+    });
+  });
+
+  return proseEl.innerHTML.trim();
+}
+
 function definitionMapper(el, idToHeading, usesDfnDataModel) {
   let definedIn = 'prose';
   const enclosingEl = el.closest('dt,pre,table,h1,h2,h3,h4,h5,h6,.note,.example') || el;
@@ -157,7 +208,7 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
   url.hash = '#' + encodeURIComponent(el.getAttribute('id'));
   const href = url.toString();
 
-  return {
+  const dfn = {
     // ID is the id attribute
     // (ID may not be unique in a multi-page spec)
     id: el.getAttribute('id'),
@@ -211,6 +262,17 @@ function definitionMapper(el, idToHeading, usesDfnDataModel) {
     // indicates that definition appears in the main body of the specification)
     definedIn
   };
+
+  // Extract a prose definition in HTML for the term, if available
+  const proseEl = document.querySelector(`[data-defines="#${dfn.id}"]`);
+  if (proseEl) {
+    const htmlProse = getHtmlProseDefinition(proseEl);
+    if (htmlProse) {
+      dfn.htmlProse = htmlProse;
+    }
+  }
+
+  return dfn;
 }
 
 export default function (spec, idToHeading = {}) {
diff --git a/tests/extract-dfns.js b/tests/extract-dfns.js
index e305ae67..a17b89f7 100644
--- a/tests/extract-dfns.js
+++ b/tests/extract-dfns.js
@@ -603,6 +603,112 @@ When initialize(newItem) is called, the following steps are run:

` }], spec: "CSS2" }, + + { + title: "extracts the prose that defines a term", + html: `

+ Foo enters a bar. +

`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + }, + + { + title: "keeps basic structure for the prose that defines a term", + html: `
+

Foo enters a bar. +
The bar has 2 baz on tap:

+
    +
  • Baz1
  • +
  • Baz2
  • +
+
Foo bar baz
+
`, + changesToBaseDfn: [{ + htmlProse: `

Foo enters a bar. +
The bar has 2 baz on tap:

+ +
Foo bar baz
` + }] + }, + + { + title: "keeps useful attributes in prose that defines a term", + html: `

+ Foo enters a bar. +

`, + changesToBaseDfn: [{ + htmlProse: `Foo enters a bar.` + }] + }, + + { + title: "keeps href in prose that defines a term", + html: `

+ Foo enters a bar. +

`, + changesToBaseDfn: [{ + htmlProse: `Foo enters a bar.` + }] + }, + + { + title: "keeps href in prose that defines a term in multi-page specs too", + html: `

+ Foo enters a bar. +

`, + changesToBaseDfn: [{ + href: "https://www.w3.org/TR/foo/page1.html#foo", + htmlProse: `Foo enters a bar.`, + heading: { + href: 'https://www.w3.org/TR/foo/page1.html', + title: '' + } + }] + }, + + { + title: "extracts prose that defines a term without extra attributes", + html: `

+ Foo enters a . +

`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + }, + + { + title: "suppresses asides from the prose that defines a term", + html: `
+ Foo enters a bar. + +

So am I

+ Lots of tests + And annotations +
A list of references
+
`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + }, + + { + title: "suppresses more complex structure from the prose that defines a term", + html: `
+ Foo enters a bar. +
+

An inner section

+
+ A bar +
`, + changesToBaseDfn: [{ + htmlProse: "Foo enters a bar." + }] + } ]; describe("Test definition extraction", function () {