From d5c02cb358dad8c4c56113cb92e633a4c46bf9d1 Mon Sep 17 00:00:00 2001 From: lopez Date: Tue, 21 Jun 2022 13:47:02 +0200 Subject: [PATCH] fix #144 and update Sweble version --- build.gradle | 4 +-- .../mediaWiki/DefaultConfigEnWp.java | 2 +- .../utilities/mediaWiki/DefaultConfigWp.java | 7 ++-- .../mediaWiki/WikiTextConverter.java | 36 ++++++++++++++----- .../mediaWiki/TestMediaWikiParser.java | 15 ++++++++ .../nerd/utilities/mediaWiki/date_fr.txt | 1 + 6 files changed, 50 insertions(+), 15 deletions(-) create mode 100644 src/test/resources/com/scienceminer/nerd/utilities/mediaWiki/date_fr.txt diff --git a/build.gradle b/build.gradle index aec7e62c..f23e1e2f 100644 --- a/build.gradle +++ b/build.gradle @@ -111,8 +111,8 @@ dependencies { implementation group: 'de.ruedigermoeller', name: 'fst', version: '2.50' //Wikipedia - implementation group: 'org.sweble.wikitext', name: 'swc-parser-lazy', version: '3.1.5' - implementation group: 'org.sweble.wikitext', name: 'swc-engine', version: '3.1.5' + implementation group: 'org.sweble.wikitext', name: 'swc-parser-lazy', version: '3.1.9' + implementation group: 'org.sweble.wikitext', name: 'swc-engine', version: '3.1.9' //XML implementation group: 'com.thoughtworks.xstream', name: 'xstream', version: '1.4.19' diff --git a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigEnWp.java b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigEnWp.java index dc954bab..7aef6956 100644 --- a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigEnWp.java +++ b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigEnWp.java @@ -317,7 +317,7 @@ protected void addNamespaces(WikiConfigImpl c) "Gadget definition talk", false, false, - new ArrayList())); + new ArrayList())); c.setDefaultNamespace(c.getNamespace(0)); c.setTemplateNamespace(c.getNamespace(10)); diff --git a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigWp.java b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigWp.java index d8c9d53c..e87ba2df 100644 --- a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigWp.java +++ b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/DefaultConfigWp.java @@ -4035,12 +4035,12 @@ protected void addI18nAliases(WikiConfigImpl c) { "time", false, Arrays.asList("#time:"))); - /* + c.addI18nAlias(new I18nAliasImpl( "timel", false, Arrays.asList("timel"))); - c.addI18nAlias(new I18nAliasImpl( + /*c.addI18nAlias(new I18nAliasImpl( "rel2abs", false, Arrays.asList("rel2abs"))); @@ -4049,12 +4049,11 @@ protected void addI18nAliases(WikiConfigImpl c) { "titleparts", false, Arrays.asList("#titleparts:"))); - /* c.addI18nAlias(new I18nAliasImpl( "convert", false, Arrays.asList("convert"))); - c.addI18nAlias(new I18nAliasImpl( + /*c.addI18nAlias(new I18nAliasImpl( "sourceunit", false, Arrays.asList("#sourceunit"))); diff --git a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/WikiTextConverter.java b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/WikiTextConverter.java index 430a6153..3cf67e8a 100644 --- a/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/WikiTextConverter.java +++ b/src/main/java/com/scienceminer/nerd/utilities/mediaWiki/WikiTextConverter.java @@ -18,6 +18,7 @@ import org.sweble.wikitext.parser.nodes.WtItalics; import org.sweble.wikitext.parser.nodes.WtListItem; import org.sweble.wikitext.parser.nodes.WtNode; +import org.sweble.wikitext.parser.nodes.WtName; import org.sweble.wikitext.parser.nodes.WtNodeList; import org.sweble.wikitext.parser.nodes.WtOrderedList; import org.sweble.wikitext.parser.nodes.WtPageSwitch; @@ -361,24 +362,36 @@ public void visit(WtXmlElement e) { } } - // Stuff we want to hide - - public void visit(WtImageLink n) { + public void visit(WtTemplate n) { + System.out.println("processing template: "+n.getName()); + WtName templateName = n.getName(); + String templateNameString = templateName.getAsString(); + + // afaik templates are very ad hoc, so we only want to keep the argument values of some of them for proper + // text serialization + if (templateNameString != null && templateToKeep(templateNameString) && n.getArgs() != null) { + iterate(n.getArgs()); + } } - public void visit(WtIllegalCodePoint n) { + public void visit(WtTemplateArgument n) { + if (n.getValue() != null) + iterate(n.getValue()); + } - public void visit(WtXmlComment n) { + // Stuff we want to hide + + public void visit(WtTemplateParameter n) { } - public void visit(WtTemplate n) { + public void visit(WtImageLink n) { } - public void visit(WtTemplateArgument n) { + public void visit(WtIllegalCodePoint n) { } - public void visit(WtTemplateParameter n) { + public void visit(WtXmlComment n) { } public void visit(WtTagExtension n) { @@ -464,4 +477,11 @@ private void write(char ch) { private void write(int num) { writeWord(String.valueOf(num)); } + + private boolean templateToKeep(String templateNameString) { + //if (templateNameString.indexOf("date") != -1 || templateNameString.equals("MSAPI")) + if (templateNameString.indexOf("date") != -1) + return true; + return false; + } } \ No newline at end of file diff --git a/src/test/java/com/scienceminer/nerd/utilities/mediaWiki/TestMediaWikiParser.java b/src/test/java/com/scienceminer/nerd/utilities/mediaWiki/TestMediaWikiParser.java index 02b27be6..99e84df0 100644 --- a/src/test/java/com/scienceminer/nerd/utilities/mediaWiki/TestMediaWikiParser.java +++ b/src/test/java/com/scienceminer/nerd/utilities/mediaWiki/TestMediaWikiParser.java @@ -209,4 +209,19 @@ public void testWikiMediaTextWithInternalLinksArticlesOnlyPt() throws Exception assertThat(result.trim(), startsWith("[[Imagem:Scriptorium.jpg")); } + @Test + public void testWikiMediaTextWithDatesFr() throws Exception { + InputStream is = this.getClass().getResourceAsStream("date_fr.txt"); + String input = IOUtils.toString(is, UTF_8); + String result = mediaWikiParser.toTextWithInternalLinksArticlesOnly(input, "fr"); + + assertThat(result, containsString("[[")); + assertThat(result, containsString("]]")); + assertThat(result, not(containsString("'''"))); + + assertThat(result.trim(), startsWith("Emmanuel Macron")); + + System.out.println(result); + } + } \ No newline at end of file diff --git a/src/test/resources/com/scienceminer/nerd/utilities/mediaWiki/date_fr.txt b/src/test/resources/com/scienceminer/nerd/utilities/mediaWiki/date_fr.txt new file mode 100644 index 00000000..ef03f636 --- /dev/null +++ b/src/test/resources/com/scienceminer/nerd/utilities/mediaWiki/date_fr.txt @@ -0,0 +1 @@ +'''Emmanuel Macron''' ({{MSAPI|/ɛmanɥɛl makʁɔ̃/}}[[Prononciation du français|Prononciation]] en [[français de France]] [[Transcription phonétique|retranscrite]] selon la [[alphabet phonétique international|norme API]]. {{prononciation|LL-Q150 (fra)-Fabricio Cardenas (Culex)-Emmanuel Macron.wav}}), né le {{date de naissance|21 décembre 1977}} à [[Amiens]] ([[France]]), est un [[Haute fonction publique française|haut fonctionnaire]] et [[homme d'État]] [[France|français]]. Il est [[président de la République française]] depuis le {{date-|14 mai 2017}}.