Skip to content

Commit

Permalink
lib: disallow < in URLs when parsing HTML
Browse files Browse the repository at this point in the history
Make sure that links placed verbatim inside HTML elements' bodies are
not parsed along with adjacent HTML tags, as illustrated by the new test
case.

Also change the existing code to use the idiomatic Go way of getting
set-like functionality (a map with empty struct values).

Changelog-fixed: Parsed links in HTML message parts now do not include
 trailing HTML tags.
Signed-off-by: Karel Balej <[email protected]>
Tested-by: Jakub Růžička <[email protected]>
  • Loading branch information
balejk authored and rjarry committed Feb 4, 2025
1 parent 3273d7e commit 02324e9
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 8 deletions.
2 changes: 1 addition & 1 deletion app/msgviewer.go
Original file line number Diff line number Diff line change
Expand Up @@ -722,7 +722,7 @@ func (pv *PartViewer) hyperlinks(r io.Reader) (reader io.Reader) {
if !config.Viewer.ParseHttpLinks {
return r
}
reader, pv.links = parse.HttpLinks(r)
reader, pv.links = parse.HttpLinks(r, pv.part.FullMIMEType() == "text/html")
return reader
}

Expand Down
17 changes: 11 additions & 6 deletions lib/parse/hyperlinks.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ var urlRe = regexp.MustCompile(
)

// HttpLinks searches a reader for a http link and returns a copy of the
// reader and a slice with links.
func HttpLinks(r io.Reader) (io.Reader, []string) {
// reader and a slice with links. If isHtml is true, left angle brackets are
// considered to always be right link delimiters.
func HttpLinks(r io.Reader, isHtml bool) (io.Reader, []string) {
buf, err := io.ReadAll(r)
if err != nil {
return r, nil
}

links := make(map[string]bool)
links := make(map[string]struct{})
b := buf
match := urlRe.FindSubmatchIndex(b)
for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
Expand All @@ -49,8 +50,12 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
paren++
j++
case '<':
ltgt++
j++
if isHtml {
emitUrl = true
} else {
ltgt++
j++
}
case ']':
bracket--
if bracket < 0 {
Expand Down Expand Up @@ -110,7 +115,7 @@ func HttpLinks(r io.Reader) (io.Reader, []string) {
// Email address with missing mailto: scheme. Add it.
url = "mailto:" + url
}
links[url] = true
links[url] = struct{}{}
b = b[j:]
}

Expand Down
14 changes: 13 additions & 1 deletion lib/parse/hyperlinks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ func TestHyperlinks(t *testing.T) {
name string
text string
links []string
html bool
}{
{
name: "http-link",
Expand Down Expand Up @@ -48,6 +49,7 @@ func TestHyperlinks(t *testing.T) {
name: "https-link-in-html",
text: "<a href=\"https://aerc-mail.org\">",
links: []string{"https://aerc-mail.org"},
html: true,
},
{
name: "https-link-twice",
Expand Down Expand Up @@ -84,6 +86,12 @@ func TestHyperlinks(t *testing.T) {
text: "text https://www.ics.uci.edu/pub/ietf/uri/#Related more text",
links: []string{"https://www.ics.uci.edu/pub/ietf/uri/#Related"},
},
{
name: "https-in-html",
text: "<div>text https://example.com/test<br>https://test.org/?a=b</div><br> text",
links: []string{"https://example.com/test", "https://test.org/?a=b"},
html: true,
},
{
name: "https-with-query",
text: "text https://www.example.com/index.php?id_sezione=360&sid=3a5ebc944f41daa6f849f730f1 more text",
Expand Down Expand Up @@ -118,28 +126,32 @@ func TestHyperlinks(t *testing.T) {
name: "simple email in <a href>",
text: `<a href="mailto:[email protected]" rel="noopener noreferrer">`,
links: []string{"mailto:[email protected]"},
html: true,
},
{
name: "simple email in <a> body",
text: `<a href="#" rel="noopener noreferrer">[email protected]</a><br/><p>more text</p>`,
links: []string{"mailto:[email protected]"},
html: true,
},
{
name: "emails in <a> href and body",
text: `<a href="mailto:[email protected]" rel="noopener noreferrer">[email protected]</a><br/><p>more text</p>`,
links: []string{"mailto:[email protected]", "mailto:[email protected]"},
html: true,
},
{
name: "email in &lt;...&gt;",
text: `<div>01.02.2023, 10:11, "Firstname Lastname" &lt;[email protected]&gt;:</div>`,
links: []string{"mailto:[email protected]"},
html: true,
},
}

for i, test := range tests {
t.Run(test.name, func(t *testing.T) {
// make sure reader is exact copy of input reader
reader, parsedLinks := parse.HttpLinks(strings.NewReader(test.text))
reader, parsedLinks := parse.HttpLinks(strings.NewReader(test.text), test.html)
if _, err := io.ReadAll(reader); err != nil {
t.Skipf("could not read text: %v", err)
}
Expand Down

0 comments on commit 02324e9

Please sign in to comment.