From 2242a9f82e26ac8a4725a8ffc0aa0cfb25aed30a Mon Sep 17 00:00:00 2001 From: silverwind Date: Sat, 6 Apr 2019 20:28:45 +0200 Subject: [PATCH] Improve SHA1 link detection (#6526) This improves the SHA1 link detection to not pick up extraneous non-whitespace characters at the end of the URL. The '.' is a special case handled in code itself because of missing regexp lookahead support. Regex test cases: https://regex101.com/r/xUMlqh/3 --- modules/markup/html.go | 53 ++++++++++++++++++---------- modules/markup/html_internal_test.go | 10 +++--- 2 files changed, 39 insertions(+), 24 deletions(-) diff --git a/modules/markup/html.go b/modules/markup/html.go index e016b67d0c..7bd8e8d8f4 100644 --- a/modules/markup/html.go +++ b/modules/markup/html.go @@ -54,7 +54,7 @@ var ( shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`) // anySHA1Pattern allows to split url containing SHA into parts - anySHA1Pattern = regexp.MustCompile(`https?://(?:\S+/){4}([0-9a-f]{40})/?([^#\s]+)?(?:#(\S+))?`) + anySHA1Pattern = regexp.MustCompile(`https?://(?:\S+/){4}([0-9a-f]{40})(/[^#\s]+)?(#\S+)?`) validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`) @@ -594,31 +594,46 @@ func fullSha1PatternProcessor(ctx *postProcessCtx, node *html.Node) { if m == nil { return } - // take out what's relevant + urlFull := node.Data[m[0]:m[1]] - hash := node.Data[m[2]:m[3]] + text := base.ShortSha(node.Data[m[2]:m[3]]) - var subtree, line string - - // optional, we do them depending on the length. 
- if m[7] > 0 { - line = node.Data[m[6]:m[7]] - } + // 2nd capture group matches an optional path + subpath := "" if m[5] > 0 { - subtree = node.Data[m[4]:m[5]] + subpath = node.Data[m[4]:m[5]] } - text := base.ShortSha(hash) - if subtree != "" { - text += "/" + subtree - } - if line != "" { - text += " (" - text += line - text += ")" + // 3rd capture group matches an optional url hash + hash := "" + if m[7] > 0 { + hash = node.Data[m[6]:m[7]][1:] } - replaceContent(node, m[0], m[1], createLink(urlFull, text)) + start := m[0] + end := m[1] + + // If url ends in '.', it's very likely that it is not part of the + // actual url but used to finish a sentence. + if strings.HasSuffix(urlFull, ".") { + end-- + urlFull = urlFull[:len(urlFull)-1] + if hash != "" { + hash = hash[:len(hash)-1] + } else if subpath != "" { + subpath = subpath[:len(subpath)-1] + } + } + + if subpath != "" { + text += subpath + } + + if hash != "" { + text += " (" + hash + ")" + } + + replaceContent(node, start, end, createLink(urlFull, text)) } // sha1CurrentPatternProcessor renders SHA1 strings to corresponding links that diff --git a/modules/markup/html_internal_test.go b/modules/markup/html_internal_test.go index ff07bab913..b8612eb2bb 100644 --- a/modules/markup/html_internal_test.go +++ b/modules/markup/html_internal_test.go @@ -273,12 +273,12 @@ func TestRegExp_anySHA1Pattern(t *testing.T) { testCases := map[string][]string{ "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js#L2703": { "a644101ed04d0beacea864ce805e0c4f86ba1cd1", - "test/unit/event.js", - "L2703", + "/test/unit/event.js", + "#L2703", }, "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js": { "a644101ed04d0beacea864ce805e0c4f86ba1cd1", - "test/unit/event.js", + "/test/unit/event.js", "", }, "https://github.com/jquery/jquery/commit/0705be475092aede1eddae01319ec931fb9c65fc": { @@ -288,13 +288,13 @@ func TestRegExp_anySHA1Pattern(t 
*testing.T) { }, "https://github.com/jquery/jquery/tree/0705be475092aede1eddae01319ec931fb9c65fc/src": { "0705be475092aede1eddae01319ec931fb9c65fc", - "src", + "/src", "", }, "https://try.gogs.io/gogs/gogs/commit/d8a994ef243349f321568f9e36d5c3f444b99cae#diff-2": { "d8a994ef243349f321568f9e36d5c3f444b99cae", "", - "diff-2", + "#diff-2", }, }