From b70e7991ea8bcb5a3a279d5a924552faca4d4f04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20H=C3=B6ffner?= <info@sebastian-hoeffner.de>
Date: Mon, 30 May 2022 03:34:12 +0200
Subject: [PATCH] markup: add --citeproc to pandoc converter

Adds the citeproc filter to the pandoc converter.

There are several PRs for this feature already. However, I think
simply adding `--citeproc` is the cleanest way to enable this feature,
with the option to flesh it out later, e.g., in #7529.

Some PRs and issues attempt adding more config options to Hugo which
indirectly configure pandoc, but I think simply configuring Pandoc via
Pandoc itself is simpler, as it is already possible with two YAML
blocks -- one for Hugo, and one for Pandoc:

    ---
    title: This is the Hugo YAML block
    ---
    ---
    bibliography: assets/pandoc-yaml-block-bibliography.bib
    ...
    Document content with @citation!

There are other useful options, e.g., #4800 attempts to use `nocite`,
which works out of the box with this PR:

    ---
    title: This is the Hugo YAML block
    ---
    ---
    bibliography: assets/pandoc-yaml-block-bibliography.bib
    nocite: |
      @*
    ...
    Document content with no citations but a full bibliography:

    ## Bibliography

Other useful options are `csl: ...` and `link-citations: true`, which
set the path to a custom CSL file and create HTML links between the
references and the bibliography.

The following issues and PRs are related:

- Add support for parsing citations and Jupyter notebooks via Pandoc and/or Goldmark extension #6101
  Bundles multiple requests, this PR tackles citation parsing.

- WIP: Bibliography with Pandoc #4800
  Passes the frontmatter to Pandoc and still uses
  `--filter pandoc-citeproc` instead of `--citeproc`.
- Allow configuring Pandoc #7529
  That PR is much more extensive and might eventually supersede this PR,
  but I think --bibliography and --citeproc should be independent
  options (--bibliography should be optional and citeproc can always be
  specified).
- Pandoc - allow citeproc extension to be invoked, with bibliography. #8610
  Similar to #7529, #8610 adds a new config option to Hugo.
  I think passing --citeproc and letting the users decide on the
  metadata they want to pass to pandoc is better, albeit uglier.
---
 .../en/content-management/bibliography.md     |  50 +++++++
 docs/content/en/content-management/formats.md |   6 +
 markup/pandoc/convert.go                      |  72 ++++++++-
 markup/pandoc/convert_test.go                 | 137 +++++++++++++++++-
 markup/pandoc/testdata/bibliography.bib       |   6 +
 5 files changed, 266 insertions(+), 5 deletions(-)
 create mode 100644 docs/content/en/content-management/bibliography.md
 create mode 100644 markup/pandoc/testdata/bibliography.bib

diff --git a/docs/content/en/content-management/bibliography.md b/docs/content/en/content-management/bibliography.md
new file mode 100644
index 00000000000..315f76a5741
--- /dev/null
+++ b/docs/content/en/content-management/bibliography.md
@@ -0,0 +1,50 @@
+---
+title: Bibliographies in Markdown
+linkTitle: Bibliography
+description: Include citations and a bibliography in Markdown using Pandoc citation syntax and BibTeX files.
+categories: [content management]
+keywords: [latex,pandoc,citation,reference,bibliography]
+menu:
+  docs:
+    parent: content-management
+    weight: 320
+weight: 320
+toc: true
+---
+
+{{< new-in 0.144.0 />}}
+
+## Citations and Bibliographies
+
+[Pandoc](https://pandoc.org) is a universal document converter and can be used to convert Markdown files.
+
+With **Pandoc >= 2.11**, you can use [citations](https://pandoc.org/MANUAL.html#extension-citations).
+One way is to employ [BibTeX files](https://en.wikibooks.org/wiki/LaTeX/Bibliography_Management#BibTeX) to cite:
+
+```
+---
+title: Citation document
+---
+---
+bibliography: assets/bibliography.bib
+...
+This is a citation: @Doe2022
+```
+
+Note that Hugo does **not** pass its own front matter (the first YAML block) to Pandoc. It does pass the **second** metadata block, delimited by `---` and `...`, to Pandoc.
+Thus, all Pandoc-specific settings should go there.
+
+You can also add all elements from a bibliography file (without citing them explicitly) using:
+
+```
+---
+title: My Publications
+---
+---
+bibliography: assets/bibliography.bib
+nocite: |
+  @*
+...
+```
+
+It is also possible to use a custom [CSL style](https://citationstyles.org/authors/) by adding `csl: path-to-style.csl` to the Pandoc metadata block.
diff --git a/docs/content/en/content-management/formats.md b/docs/content/en/content-management/formats.md
index a1f203f3c7d..b23d0cc5491 100644
--- a/docs/content/en/content-management/formats.md
+++ b/docs/content/en/content-management/formats.md
@@ -111,6 +111,12 @@ Hugo passes these CLI flags when calling the Pandoc executable:
 --mathjax
 ```
 
+If your Pandoc version is 2.11 or later, Hugo also passes this CLI flag:
+
+```text
+--citeproc
+```
+
 [Pandoc]: https://pandoc.org/
 
 ### reStructuredText
diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go
index 8f2d99c9a83..3da923d3e96 100644
--- a/markup/pandoc/convert.go
+++ b/markup/pandoc/convert.go
@@ -15,10 +15,14 @@
 package pandoc
 
 import (
+	"bytes"
+	"strconv"
+	"strings"
+	"sync"
+
 	"github.com/gohugoio/hugo/common/hexec"
 	"github.com/gohugoio/hugo/htesting"
 	"github.com/gohugoio/hugo/identity"
-
 	"github.com/gohugoio/hugo/markup/converter"
 	"github.com/gohugoio/hugo/markup/internal"
 )
@@ -64,6 +68,9 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon
 		return src, nil
 	}
 	args := []string{"--mathjax"}
+	if supportsCitations(c.cfg) {
+		args = append(args[:], "--citeproc")
+	}
 	return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args)
 }
 
@@ -76,6 +83,69 @@ func getPandocBinaryName() string {
 	return ""
 }
 
+// pandocVersion holds the major and minor numbers of a pandoc release.
+type pandocVersion struct{ major, minor int64 }
+
+// greaterThanOrEqual reports whether v is the same release as other or a newer one.
+func (v pandocVersion) greaterThanOrEqual(other pandocVersion) bool {
+	return (v.major == other.major && v.minor >= other.minor) || v.major > other.major
+}
+
+var versionOnce sync.Once
+var foundPandocVersion pandocVersion
+var foundPandocErr error // cached with the version so every call reports it
+
+// getPandocVersion runs "pandoc --version" at most once per process and returns
+// the parsed major and minor version. The outcome is cached: later calls return
+// the same version and error. On failure the version is zero and the error non-nil.
+func getPandocVersion(cfg converter.ProviderConfig) (pandocVersion, error) {
+	versionOnce.Do(func() {
+		var out bytes.Buffer
+		cmd, err := cfg.Exec.New(pandocBinary, "--version", hexec.WithStdout(&out))
+		if err != nil {
+			cfg.Logger.Errorf("Could not call pandoc: %v", err)
+			foundPandocErr = err
+			return
+		}
+		if err = cmd.Run(); err != nil {
+			cfg.Logger.Errorf("%s --version: %v", pandocBinary, err)
+			foundPandocErr = err
+			return
+		}
+		// The first line looks like "pandoc 2.19.2". Missing pieces become "" and
+		// fail in ParseInt below instead of panicking on an out-of-range index.
+		firstLine, _, _ := strings.Cut(out.String(), "\n")
+		var version string
+		if fields := strings.Fields(firstLine); len(fields) > 1 {
+			version = fields[1]
+		}
+		majorStr, rest, _ := strings.Cut(version, ".")
+		minorStr, _, _ := strings.Cut(rest, ".")
+		major, majorErr := strconv.ParseInt(majorStr, 10, 64)
+		minor, minorErr := strconv.ParseInt(minorStr, 10, 64)
+		if err = majorErr; err == nil {
+			err = minorErr
+		}
+		if err != nil {
+			cfg.Logger.Errorf("%s --version: cannot parse %q: %v", pandocBinary, firstLine, err)
+			foundPandocErr = err
+			return
+		}
+		foundPandocVersion = pandocVersion{major, minor}
+	})
+	return foundPandocVersion, foundPandocErr
+}
+
+// supportsCitations reports whether pandoc is installed and at least version
+// 2.11, the versions that include citeproc. Any version lookup error means false.
+func supportsCitations(cfg converter.ProviderConfig) bool {
+	if !Supports() {
+		return false
+	}
+	version, err := getPandocVersion(cfg)
+	return err == nil && version.greaterThanOrEqual(pandocVersion{2, 11})
+}
+
 // Supports returns whether Pandoc is installed on this computer.
 func Supports() bool {
 	hasBin := getPandocBinaryName() != ""
diff --git a/markup/pandoc/convert_test.go b/markup/pandoc/convert_test.go
index dff6b1ed37a..4565338479f 100644
--- a/markup/pandoc/convert_test.go
+++ b/markup/pandoc/convert_test.go
@@ -25,7 +25,7 @@ import (
 	qt "github.com/frankban/quicktest"
 )
 
-func TestConvert(t *testing.T) {
+func setupTestConverter(t *testing.T) (*qt.C, converter.Converter, converter.ProviderConfig) {
 	if !Supports() {
 		t.Skip("pandoc not installed")
 	}
@@ -34,11 +34,140 @@ func TestConvert(t *testing.T) {
 	var err error
 	sc.Exec.Allow, err = security.NewWhitelist("pandoc")
 	c.Assert(err, qt.IsNil)
-	p, err := Provider.New(converter.ProviderConfig{Exec: hexec.New(sc, "", loggers.NewDefault()), Logger: loggers.NewDefault()})
+	cfg := converter.ProviderConfig{Exec: hexec.New(sc, "", loggers.NewDefault()), Logger: loggers.NewDefault()}
+	p, err := Provider.New(cfg)
 	c.Assert(err, qt.IsNil)
 	conv, err := p.New(converter.DocumentContext{})
 	c.Assert(err, qt.IsNil)
-	b, err := conv.Convert(converter.RenderContext{Src: []byte("testContent")})
+	return c, conv, cfg
+}
+
+func TestConvert(t *testing.T) {
+	c, conv, _ := setupTestConverter(t)
+	output, err := conv.Convert(converter.RenderContext{Src: []byte("testContent")})
 	c.Assert(err, qt.IsNil)
-	c.Assert(string(b.Bytes()), qt.Equals, "<p>testContent</p>\n")
+	c.Assert(string(output.Bytes()), qt.Equals, "<p>testContent</p>\n")
+}
+
+func runCiteprocTest(t *testing.T, content string, expectContained []string, expectNotContained []string) {
+	c, conv, cfg := setupTestConverter(t)
+	if !supportsCitations(cfg) {
+		t.Skip("pandoc does not support citations")
+	}
+	result, convErr := conv.Convert(converter.RenderContext{Src: []byte(content)})
+	c.Assert(convErr, qt.IsNil)
+	for _, want := range expectContained { // every listed fragment must be rendered
+		c.Assert(string(result.Bytes()), qt.Contains, want)
+	}
+	for _, unwanted := range expectNotContained { // and none of these may appear
+		c.Assert(string(result.Bytes()), qt.Not(qt.Contains), unwanted)
+	}
+}
+
+// TestGetPandocVersionCallTwice checks that two lookups agree and both succeed.
+func TestGetPandocVersionCallTwice(t *testing.T) {
+	c, _, cfg := setupTestConverter(t)
+	first, firstErr := getPandocVersion(cfg)
+	second, secondErr := getPandocVersion(cfg)
+	c.Assert(first, qt.Equals, second)
+	c.Assert(firstErr, qt.IsNil)
+	c.Assert(secondErr, qt.IsNil)
+}
+
+// TestPandocVersionEquality checks greaterThanOrEqual in both directions for
+// versions that differ in major, in minor, or in both.
+func TestPandocVersionEquality(t *testing.T) {
+	c := qt.New(t)
+	cases := []struct {
+		left, right pandocVersion
+		want        bool
+	}{
+		// A version is greater than or equal to itself.
+		{pandocVersion{1, 0}, pandocVersion{1, 0}, true},
+		// Only the major number differs.
+		{pandocVersion{1, 0}, pandocVersion{2, 0}, false},
+		{pandocVersion{2, 0}, pandocVersion{1, 0}, true},
+		{pandocVersion{2, 2}, pandocVersion{1, 2}, true},
+		{pandocVersion{1, 2}, pandocVersion{2, 2}, false},
+		// Only the minor number differs; 11 must compare above 2 numerically.
+		{pandocVersion{2, 0}, pandocVersion{2, 2}, false},
+		{pandocVersion{2, 2}, pandocVersion{2, 0}, true},
+		{pandocVersion{2, 11}, pandocVersion{2, 2}, true},
+		{pandocVersion{2, 2}, pandocVersion{2, 11}, false},
+		// The major number wins over a larger minor number.
+		{pandocVersion{3, 9}, pandocVersion{2, 11}, true},
+		{pandocVersion{2, 11}, pandocVersion{3, 9}, false},
+		{pandocVersion{2, 11}, pandocVersion{1, 15}, true},
+		{pandocVersion{1, 15}, pandocVersion{2, 11}, false},
+	}
+	for _, tc := range cases {
+		c.Assert(tc.left.greaterThanOrEqual(tc.right), qt.Equals, tc.want)
+	}
+}
+
+// TestCiteprocWithHugoMeta checks that a document with a single YAML block and
+// no bibliography renders its body and none of the test bibliography's fields.
+func TestCiteprocWithHugoMeta(t *testing.T) {
+	src := `
+---
+title: Test
+published: 2022-05-30
+---
+testContent
+`
+	runCiteprocTest(t, src, []string{"testContent"}, []string{"Doe", "Mustermann", "2022", "Treatise"})
+}
+
+// TestCiteprocWithPandocMeta checks that an empty `---`/`...` metadata block
+// after an empty leading block renders the body and no bibliography fields.
+func TestCiteprocWithPandocMeta(t *testing.T) {
+	src := `
+---
+---
+---
+...
+testContent
+`
+	runCiteprocTest(t, src, []string{"testContent"}, []string{"Doe", "Mustermann", "2022", "Treatise"})
+}
+
+// TestCiteprocWithBibliography checks that naming a bibliography file without
+// citing anything renders the body and none of the bibliography's fields.
+func TestCiteprocWithBibliography(t *testing.T) {
+	src := `
+---
+---
+---
+bibliography: testdata/bibliography.bib
+...
+testContent
+`
+	runCiteprocTest(t, src, []string{"testContent"}, []string{"Doe", "Mustermann", "2022", "Treatise"})
+}
+
+// TestCiteprocWithExplicitCitation checks that citing @Doe2022 renders its fields.
+func TestCiteprocWithExplicitCitation(t *testing.T) {
+	src := `
+---
+---
+---
+bibliography: testdata/bibliography.bib
+...
+@Doe2022
+`
+	runCiteprocTest(t, src, []string{"Doe", "Mustermann", "2022", "Treatise"}, []string{})
+}
+
+// TestCiteprocWithNocite checks that `nocite: @*` renders entries without a citation.
+func TestCiteprocWithNocite(t *testing.T) {
+	src := `
+---
+---
+---
+bibliography: testdata/bibliography.bib
+nocite: |
+  @*
+...
+`
+	runCiteprocTest(t, src, []string{"Doe", "Mustermann", "2022", "Treatise"}, []string{})
 }
diff --git a/markup/pandoc/testdata/bibliography.bib b/markup/pandoc/testdata/bibliography.bib
new file mode 100644
index 00000000000..8fc1019b435
--- /dev/null
+++ b/markup/pandoc/testdata/bibliography.bib
@@ -0,0 +1,6 @@
+@article{Doe2022,
+    author    = "Jane Doe and Max Mustermann",
+    title     = "A Treatise on Hugo Tests",
+    journal   = "Hugo Websites",
+    year      = "2022",
+}