Improve code diff highlight, fix incorrect rendered diff result (#19958)

Use Unicode placeholders to replace HTML tags and HTML entities first, then do diff, then recover the HTML tags and HTML entities. Now the code diff with highlight has stable behavior, and won't emit broken tags.
This commit is contained in:
wxiaoguang 2022-07-23 19:28:02 +08:00 committed by GitHub
parent 14178c56bb
commit 3310dd1d19
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 379 additions and 378 deletions

View file

@ -40,9 +40,11 @@ var (
// NewContext loads custom highlight map from local config
func NewContext() {
once.Do(func() {
keys := setting.Cfg.Section("highlight.mapping").Keys()
for i := range keys {
highlightMapping[keys[i].Name()] = keys[i].Value()
if setting.Cfg != nil {
keys := setting.Cfg.Section("highlight.mapping").Keys()
for i := range keys {
highlightMapping[keys[i].Name()] = keys[i].Value()
}
}
// The size 512 is simply a conservative rule of thumb

View file

@ -15,7 +15,6 @@ import (
"io"
"net/url"
"os"
"regexp"
"sort"
"strings"
"time"
@ -40,7 +39,7 @@ import (
"golang.org/x/text/transform"
)
// DiffLineType represents the type of a DiffLine.
// DiffLineType represents the type of DiffLine.
type DiffLineType uint8
// DiffLineType possible values.
@ -51,7 +50,7 @@ const (
DiffLineSection
)
// DiffFileType represents the type of a DiffFile.
// DiffFileType represents the type of DiffFile.
type DiffFileType uint8
// DiffFileType possible values.
@ -100,12 +99,12 @@ type DiffLineSectionInfo struct {
// BlobExcerptChunkSize represent max lines of excerpt
const BlobExcerptChunkSize = 20
// GetType returns the type of a DiffLine.
// GetType returns the type of DiffLine.
func (d *DiffLine) GetType() int {
return int(d.Type)
}
// CanComment returns whether or not a line can get commented
// CanComment returns whether a line can get commented
func (d *DiffLine) CanComment() bool {
return len(d.Comments) == 0 && d.Type != DiffLineSection
}
@ -191,287 +190,13 @@ var (
codeTagSuffix = []byte(`</span>`)
)
var (
unfinishedtagRegex = regexp.MustCompile(`<[^>]*$`)
trailingSpanRegex = regexp.MustCompile(`<span\s*[[:alpha:]="]*?[>]?$`)
entityRegex = regexp.MustCompile(`&[#]*?[0-9[:alpha:]]*$`)
)
// shouldWriteInline represents combinations where we manually write inline changes
func shouldWriteInline(diff diffmatchpatch.Diff, lineType DiffLineType) bool {
if true &&
diff.Type == diffmatchpatch.DiffEqual ||
diff.Type == diffmatchpatch.DiffInsert && lineType == DiffLineAdd ||
diff.Type == diffmatchpatch.DiffDelete && lineType == DiffLineDel {
return true
}
return false
}
func fixupBrokenSpans(diffs []diffmatchpatch.Diff) []diffmatchpatch.Diff {
// Create a new array to store our fixed up blocks
fixedup := make([]diffmatchpatch.Diff, 0, len(diffs))
// semantically label some numbers
const insert, delete, equal = 0, 1, 2
// record the positions of the last type of each block in the fixedup blocks
last := []int{-1, -1, -1}
operation := []diffmatchpatch.Operation{diffmatchpatch.DiffInsert, diffmatchpatch.DiffDelete, diffmatchpatch.DiffEqual}
// create a writer for insert and deletes
toWrite := []strings.Builder{
{},
{},
}
// make some flags for insert and delete
unfinishedTag := []bool{false, false}
unfinishedEnt := []bool{false, false}
// store stores the provided text in the writer for the typ
store := func(text string, typ int) {
(&(toWrite[typ])).WriteString(text)
}
// hasStored returns true if there is stored content
hasStored := func(typ int) bool {
return (&toWrite[typ]).Len() > 0
}
// stored will return that content
stored := func(typ int) string {
return (&toWrite[typ]).String()
}
// empty will empty the stored content
empty := func(typ int) {
(&toWrite[typ]).Reset()
}
// pop will remove the stored content appending to a diff block for that typ
pop := func(typ int, fixedup []diffmatchpatch.Diff) []diffmatchpatch.Diff {
if hasStored(typ) {
if last[typ] > last[equal] {
fixedup[last[typ]].Text += stored(typ)
} else {
fixedup = append(fixedup, diffmatchpatch.Diff{
Type: operation[typ],
Text: stored(typ),
})
}
empty(typ)
}
return fixedup
}
// Now we walk the provided diffs and check the type of each block in turn
for _, diff := range diffs {
typ := delete // flag for handling insert or delete typs
switch diff.Type {
case diffmatchpatch.DiffEqual:
// First check if there is anything stored
if hasStored(insert) || hasStored(delete) {
// There are two reasons for storing content:
// 1. Unfinished Entity <- Could be more efficient here by not doing this if we're looking for a tag
if unfinishedEnt[insert] || unfinishedEnt[delete] {
// we look for a ';' to finish an entity
idx := strings.IndexRune(diff.Text, ';')
if idx >= 0 {
// if we find a ';' store the preceding content to both insert and delete
store(diff.Text[:idx+1], insert)
store(diff.Text[:idx+1], delete)
// and remove it from this block
diff.Text = diff.Text[idx+1:]
// reset the ent flags
unfinishedEnt[insert] = false
unfinishedEnt[delete] = false
} else {
// otherwise store it all on insert and delete
store(diff.Text, insert)
store(diff.Text, delete)
// and empty this block
diff.Text = ""
}
}
// 2. Unfinished Tag
if unfinishedTag[insert] || unfinishedTag[delete] {
// we look for a '>' to finish a tag
idx := strings.IndexRune(diff.Text, '>')
if idx >= 0 {
store(diff.Text[:idx+1], insert)
store(diff.Text[:idx+1], delete)
diff.Text = diff.Text[idx+1:]
unfinishedTag[insert] = false
unfinishedTag[delete] = false
} else {
store(diff.Text, insert)
store(diff.Text, delete)
diff.Text = ""
}
}
// If we've completed the required tag/entities
if !(unfinishedTag[insert] || unfinishedTag[delete] || unfinishedEnt[insert] || unfinishedEnt[delete]) {
// pop off the stack
fixedup = pop(insert, fixedup)
fixedup = pop(delete, fixedup)
}
// If that has left this diff block empty then shortcut
if len(diff.Text) == 0 {
continue
}
}
// check if this block ends in an unfinished tag?
idx := unfinishedtagRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedTag[insert] = true
unfinishedTag[delete] = true
} else {
// otherwise does it end in an unfinished entity?
idx = entityRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedEnt[insert] = true
unfinishedEnt[delete] = true
}
}
// If there is an unfinished component
if idx != nil {
// Store the fragment
store(diff.Text[idx[0]:], insert)
store(diff.Text[idx[0]:], delete)
// and remove it from this block
diff.Text = diff.Text[:idx[0]]
}
// If that hasn't left the block empty
if len(diff.Text) > 0 {
// store the position of the last equal block and store it in our diffs
last[equal] = len(fixedup)
fixedup = append(fixedup, diff)
}
continue
case diffmatchpatch.DiffInsert:
typ = insert
fallthrough
case diffmatchpatch.DiffDelete:
// First check if there is anything stored for this type
if hasStored(typ) {
// if there is prepend it to this block, empty the storage and reset our flags
diff.Text = stored(typ) + diff.Text
empty(typ)
unfinishedEnt[typ] = false
unfinishedTag[typ] = false
}
// check if this block ends in an unfinished tag
idx := unfinishedtagRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedTag[typ] = true
} else {
// otherwise does it end in an unfinished entity
idx = entityRegex.FindStringIndex(diff.Text)
if idx != nil {
unfinishedEnt[typ] = true
}
}
// If there is an unfinished component
if idx != nil {
// Store the fragment
store(diff.Text[idx[0]:], typ)
// and remove it from this block
diff.Text = diff.Text[:idx[0]]
}
// If that hasn't left the block empty
if len(diff.Text) > 0 {
// if the last block of this type was after the last equal block
if last[typ] > last[equal] {
// store this blocks content on that block
fixedup[last[typ]].Text += diff.Text
} else {
// otherwise store the position of the last block of this type and store the block
last[typ] = len(fixedup)
fixedup = append(fixedup, diff)
}
}
continue
}
}
// pop off any remaining stored content
fixedup = pop(insert, fixedup)
fixedup = pop(delete, fixedup)
return fixedup
}
func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineType) DiffInline {
func diffToHTML(lineWrapperTags []string, diffs []diffmatchpatch.Diff, lineType DiffLineType) string {
buf := bytes.NewBuffer(nil)
match := ""
diffs = fixupBrokenSpans(diffs)
// restore the line wrapper tags <span class="line"> and <span class="cl">, if necessary
for _, tag := range lineWrapperTags {
buf.WriteString(tag)
}
for _, diff := range diffs {
if shouldWriteInline(diff, lineType) {
if len(match) > 0 {
diff.Text = match + diff.Text
match = ""
}
// Chroma HTML syntax highlighting is done before diffing individual lines in order to maintain consistency.
// Since inline changes might split in the middle of a chroma span tag or HTML entity, make we manually put it back together
// before writing so we don't try insert added/removed code spans in the middle of one of those
// and create broken HTML. This is done by moving incomplete HTML forward until it no longer matches our pattern of
// a line ending with an incomplete HTML entity or partial/opening <span>.
// EX:
// diffs[{Type: dmp.DiffDelete, Text: "language</span><span "},
// {Type: dmp.DiffEqual, Text: "c"},
// {Type: dmp.DiffDelete, Text: "lass="p">}]
// After first iteration
// diffs[{Type: dmp.DiffDelete, Text: "language</span>"}, //write out
// {Type: dmp.DiffEqual, Text: "<span c"},
// {Type: dmp.DiffDelete, Text: "lass="p">,</span>}]
// After second iteration
// {Type: dmp.DiffEqual, Text: ""}, // write out
// {Type: dmp.DiffDelete, Text: "<span class="p">,</span>}]
// Final
// {Type: dmp.DiffDelete, Text: "<span class="p">,</span>}]
// end up writing <span class="removed-code"><span class="p">,</span></span>
// Instead of <span class="removed-code">lass="p",</span></span>
m := trailingSpanRegex.FindStringSubmatchIndex(diff.Text)
if m != nil {
match = diff.Text[m[0]:m[1]]
diff.Text = strings.TrimSuffix(diff.Text, match)
}
m = entityRegex.FindStringSubmatchIndex(diff.Text)
if m != nil {
match = diff.Text[m[0]:m[1]]
diff.Text = strings.TrimSuffix(diff.Text, match)
}
// Print an existing closing span first before opening added/remove-code span so it doesn't unintentionally close it
if strings.HasPrefix(diff.Text, "</span>") {
buf.WriteString("</span>")
diff.Text = strings.TrimPrefix(diff.Text, "</span>")
}
// If we weren't able to fix it then this should avoid broken HTML by not inserting more spans below
// The previous/next diff section will contain the rest of the tag that is missing here
if strings.Count(diff.Text, "<") != strings.Count(diff.Text, ">") {
buf.WriteString(diff.Text)
continue
}
}
switch {
case diff.Type == diffmatchpatch.DiffEqual:
buf.WriteString(diff.Text)
@ -485,7 +210,10 @@ func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineT
buf.Write(codeTagSuffix)
}
}
return DiffInlineWithUnicodeEscape(template.HTML(buf.String()))
for range lineWrapperTags {
buf.WriteString("</span>")
}
return buf.String()
}
// GetLine gets a specific line by type (add or del) and file line number
@ -597,10 +325,12 @@ func (diffSection *DiffSection) GetComputedInlineDiffFor(diffLine *DiffLine) Dif
return DiffInlineWithHighlightCode(diffSection.FileName, language, diffLine.Content)
}
diffRecord := diffMatchPatch.DiffMain(highlight.Code(diffSection.FileName, language, diff1[1:]), highlight.Code(diffSection.FileName, language, diff2[1:]), true)
diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord)
return diffToHTML(diffSection.FileName, diffRecord, diffLine.Type)
hcd := newHighlightCodeDiff()
diffRecord := hcd.diffWithHighlight(diffSection.FileName, language, diff1[1:], diff2[1:])
// it seems that Gitea doesn't need the line wrapper of Chroma, so do not add them back
// if the line wrappers are still needed in the future, it can be added back by "diffToHTML(hcd.lineWrapperTags. ...)"
diffHTML := diffToHTML(nil, diffRecord, diffLine.Type)
return DiffInlineWithUnicodeEscape(template.HTML(diffHTML))
}
// DiffFile represents a file diff.
@ -1289,7 +1019,7 @@ func readFileName(rd *strings.Reader) (string, bool) {
if char == '"' {
fmt.Fscanf(rd, "%q ", &name)
if len(name) == 0 {
log.Error("Reader has no file name: %v", rd)
log.Error("Reader has no file name: reader=%+v", rd)
return "", true
}
@ -1311,7 +1041,7 @@ func readFileName(rd *strings.Reader) (string, bool) {
}
}
if len(name) < 2 {
log.Error("Unable to determine name from reader: %v", rd)
log.Error("Unable to determine name from reader: reader=%+v", rd)
return "", true
}
return name[2:], ambiguity

View file

@ -7,7 +7,6 @@ package gitdiff
import (
"fmt"
"html/template"
"strconv"
"strings"
"testing"
@ -17,93 +16,27 @@ import (
"code.gitea.io/gitea/models/unittest"
user_model "code.gitea.io/gitea/models/user"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/highlight"
"code.gitea.io/gitea/modules/json"
"code.gitea.io/gitea/modules/setting"
dmp "github.com/sergi/go-diff/diffmatchpatch"
"github.com/stretchr/testify/assert"
"gopkg.in/ini.v1"
)
func assertEqual(t *testing.T, s1 string, s2 template.HTML) {
if s1 != string(s2) {
t.Errorf("Did not receive expected results:\nExpected: %s\nActual: %s", s1, s2)
}
}
func TestDiffToHTML(t *testing.T) {
setting.Cfg = ini.Empty()
assertEqual(t, "foo <span class=\"added-code\">bar</span> biz", diffToHTML("", []dmp.Diff{
assert.Equal(t, "foo <span class=\"added-code\">bar</span> biz", diffToHTML(nil, []dmp.Diff{
{Type: dmp.DiffEqual, Text: "foo "},
{Type: dmp.DiffInsert, Text: "bar"},
{Type: dmp.DiffDelete, Text: " baz"},
{Type: dmp.DiffEqual, Text: " biz"},
}, DiffLineAdd).Content)
}, DiffLineAdd))
assertEqual(t, "foo <span class=\"removed-code\">bar</span> biz", diffToHTML("", []dmp.Diff{
assert.Equal(t, "foo <span class=\"removed-code\">bar</span> biz", diffToHTML(nil, []dmp.Diff{
{Type: dmp.DiffEqual, Text: "foo "},
{Type: dmp.DiffDelete, Text: "bar"},
{Type: dmp.DiffInsert, Text: " baz"},
{Type: dmp.DiffEqual, Text: " biz"},
}, DiffLineDel).Content)
assertEqual(t, "<span class=\"k\">if</span> <span class=\"p\">!</span><span class=\"nx\">nohl</span> <span class=\"o\">&amp;&amp;</span> <span class=\"added-code\"><span class=\"p\">(</span></span><span class=\"nx\">lexer</span> <span class=\"o\">!=</span> <span class=\"kc\">nil</span><span class=\"added-code\"> <span class=\"o\">||</span> <span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nx\">GuessLanguage</span><span class=\"p\">)</span></span> <span class=\"p\">{</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "<span class=\"k\">if</span> <span class=\"p\">!</span><span class=\"nx\">nohl</span> <span class=\"o\">&amp;&amp;</span> <span class=\""},
{Type: dmp.DiffInsert, Text: "p\">(</span><span class=\""},
{Type: dmp.DiffEqual, Text: "nx\">lexer</span> <span class=\"o\">!=</span> <span class=\"kc\">nil"},
{Type: dmp.DiffInsert, Text: "</span> <span class=\"o\">||</span> <span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nx\">GuessLanguage</span><span class=\"p\">)"},
{Type: dmp.DiffEqual, Text: "</span> <span class=\"p\">{</span>"},
}, DiffLineAdd).Content)
assertEqual(t, "<span class=\"nx\">tagURL</span> <span class=\"o\">:=</span> <span class=\"removed-code\"><span class=\"nx\">fmt</span><span class=\"p\">.</span><span class=\"nf\">Sprintf</span><span class=\"p\">(</span><span class=\"s\">&#34;## [%s](%s/%s/%s/%s?q=&amp;type=all&amp;state=closed&amp;milestone=%d) - %s&#34;</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Milestone\"</span></span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">BaseURL</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Owner</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Repo</span><span class=\"p\">,</span> <span class=\"removed-code\"><span class=\"nx\">from</span><span class=\"p\">,</span> <span class=\"nx\">milestoneID</span><span class=\"p\">,</span> <span class=\"nx\">time</span><span class=\"p\">.</span><span class=\"nf\">Now</span><span class=\"p\">(</span><span class=\"p\">)</span><span class=\"p\">.</span><span class=\"nf\">Format</span><span class=\"p\">(</span><span class=\"s\">&#34;2006-01-02&#34;</span><span class=\"p\">)</span></span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "<span class=\"nx\">tagURL</span> <span class=\"o\">:=</span> <span class=\"n"},
{Type: dmp.DiffDelete, Text: "x\">fmt</span><span class=\"p\">.</span><span class=\"nf\">Sprintf</span><span class=\"p\">(</span><span class=\"s\">&#34;## [%s](%s/%s/%s/%s?q=&amp;type=all&amp;state=closed&amp;milestone=%d) - %s&#34;</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Milestone\""},
{Type: dmp.DiffInsert, Text: "f\">getGiteaTagURL</span><span class=\"p\">(</span><span class=\"nx\">client"},
{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">BaseURL</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Owner</span><span class=\"p\">,</span> <span class=\"nx\">ge</span><span class=\"p\">.</span><span class=\"nx\">Repo</span><span class=\"p\">,</span> <span class=\"nx\">"},
{Type: dmp.DiffDelete, Text: "from</span><span class=\"p\">,</span> <span class=\"nx\">milestoneID</span><span class=\"p\">,</span> <span class=\"nx\">time</span><span class=\"p\">.</span><span class=\"nf\">Now</span><span class=\"p\">(</span><span class=\"p\">)</span><span class=\"p\">.</span><span class=\"nf\">Format</span><span class=\"p\">(</span><span class=\"s\">&#34;2006-01-02&#34;</span><span class=\"p\">)"},
{Type: dmp.DiffInsert, Text: "ge</span><span class=\"p\">.</span><span class=\"nx\">Milestone</span><span class=\"p\">,</span> <span class=\"nx\">from</span><span class=\"p\">,</span> <span class=\"nx\">milestoneID"},
{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">)</span>"},
}, DiffLineDel).Content)
assertEqual(t, "<span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nf\">WrapperRenderer</span><span class=\"p\">(</span><span class=\"nx\">w</span><span class=\"p\">,</span> <span class=\"removed-code\"><span class=\"nx\">language</span><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "<span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nf\">WrapperRenderer</span><span class=\"p\">(</span><span class=\"nx\">w</span><span class=\"p\">,</span> <span class=\"nx\">"},
{Type: dmp.DiffDelete, Text: "language</span><span "},
{Type: dmp.DiffEqual, Text: "c"},
{Type: dmp.DiffDelete, Text: "lass=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs"},
{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>"},
}, DiffLineDel).Content)
assertEqual(t, "<span class=\"added-code\">language</span><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffInsert, Text: "language</span><span "},
{Type: dmp.DiffEqual, Text: "c"},
{Type: dmp.DiffInsert, Text: "lass=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs"},
{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>"},
}, DiffLineAdd).Content)
assertEqual(t, "<span class=\"k\">print</span><span class=\"added-code\"><span class=\"p\">(</span></span><span class=\"sa\"></span><span class=\"s2\">&#34;</span><span class=\"s2\">// </span><span class=\"s2\">&#34;</span><span class=\"p\">,</span> <span class=\"n\">sys</span><span class=\"o\">.</span><span class=\"n\">argv</span><span class=\"added-code\"><span class=\"p\">)</span></span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "<span class=\"k\">print</span>"},
{Type: dmp.DiffInsert, Text: "<span"},
{Type: dmp.DiffEqual, Text: " "},
{Type: dmp.DiffInsert, Text: "class=\"p\">(</span>"},
{Type: dmp.DiffEqual, Text: "<span class=\"sa\"></span><span class=\"s2\">&#34;</span><span class=\"s2\">// </span><span class=\"s2\">&#34;</span><span class=\"p\">,</span> <span class=\"n\">sys</span><span class=\"o\">.</span><span class=\"n\">argv</span>"},
{Type: dmp.DiffInsert, Text: "<span class=\"p\">)</span>"},
}, DiffLineAdd).Content)
assertEqual(t, "sh <span class=\"added-code\">&#39;useradd -u $(stat -c &#34;%u&#34; .gitignore) jenkins&#39;</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "sh &#3"},
{Type: dmp.DiffDelete, Text: "4;useradd -u 111 jenkins&#34"},
{Type: dmp.DiffInsert, Text: "9;useradd -u $(stat -c &#34;%u&#34; .gitignore) jenkins&#39"},
{Type: dmp.DiffEqual, Text: ";"},
}, DiffLineAdd).Content)
assertEqual(t, "<span class=\"x\"> &lt;h<span class=\"added-code\">4 class=&#34;release-list-title df ac&#34;</span>&gt;</span>", diffToHTML("", []dmp.Diff{
{Type: dmp.DiffEqual, Text: "<span class=\"x\"> &lt;h"},
{Type: dmp.DiffInsert, Text: "4 class=&#"},
{Type: dmp.DiffEqual, Text: "3"},
{Type: dmp.DiffInsert, Text: "4;release-list-title df ac&#34;"},
{Type: dmp.DiffEqual, Text: "&gt;</span>"},
}, DiffLineAdd).Content)
}, DiffLineDel))
}
func TestParsePatch_skipTo(t *testing.T) {
@ -592,7 +525,6 @@ index 0000000..6bb8f39
if err != nil {
t.Errorf("ParsePatch failed: %s", err)
}
println(result)
diff2 := `diff --git "a/A \\ B" "b/A \\ B"
--- "a/A \\ B"
@ -712,18 +644,6 @@ func TestGetDiffRangeWithWhitespaceBehavior(t *testing.T) {
}
}
func TestDiffToHTML_14231(t *testing.T) {
setting.Cfg = ini.Empty()
diffRecord := diffMatchPatch.DiffMain(highlight.Code("main.v", "", " run()\n"), highlight.Code("main.v", "", " run(db)\n"), true)
diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord)
expected := `<span class="line"><span class="cl"> <span class="n">run</span><span class="added-code"><span class="o">(</span><span class="n">db</span></span><span class="o">)</span>
</span></span>`
output := diffToHTML("main.v", diffRecord, DiffLineAdd)
assertEqual(t, expected, output.Content)
}
func TestNoCrashes(t *testing.T) {
type testcase struct {
gitdiff string

View file

@ -0,0 +1,223 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package gitdiff
import (
"strings"
"code.gitea.io/gitea/modules/highlight"
"github.com/sergi/go-diff/diffmatchpatch"
)
// token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
func extractHTMLToken(s string) (before, token, after string, valid bool) {
for pos1 := 0; pos1 < len(s); pos1++ {
if s[pos1] == '<' {
pos2 := strings.IndexByte(s[pos1:], '>')
if pos2 == -1 {
return "", "", s, false
}
return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
} else if s[pos1] == '&' {
pos2 := strings.IndexByte(s[pos1:], ';')
if pos2 == -1 {
return "", "", s, false
}
return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
}
}
return "", "", s, true
}
// highlightCodeDiff is used to do diff with highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
// These Unicode placeholders are friendly to the diff.
// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
// It's guaranteed that the tags in final diff result are paired correctly.
type highlightCodeDiff struct {
placeholderBegin rune
placeholderMaxCount int
placeholderIndex int
placeholderTokenMap map[rune]string
tokenPlaceholderMap map[string]rune
placeholderOverflowCount int
lineWrapperTags []string
}
func newHighlightCodeDiff() *highlightCodeDiff {
return &highlightCodeDiff{
placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
placeholderMaxCount: 64000,
placeholderTokenMap: map[rune]string{},
tokenPlaceholderMap: map[string]rune{},
}
}
// nextPlaceholder returns 0 if no more placeholder can be used
// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
// so the placeholderMaxCount is impossible to be exhausted in real cases.
func (hcd *highlightCodeDiff) nextPlaceholder() rune {
for hcd.placeholderIndex < hcd.placeholderMaxCount {
r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
hcd.placeholderIndex++
// only use non-existing (not used by code) rune as placeholders
if _, ok := hcd.placeholderTokenMap[r]; !ok {
return r
}
}
return 0 // no more available placeholder
}
func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
}
func (hcd *highlightCodeDiff) collectUsedRunes(code string) {
for _, r := range code {
if hcd.isInPlaceholderRange(r) {
// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
hcd.placeholderTokenMap[r] = ""
}
}
}
func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
hcd.collectUsedRunes(codeA)
hcd.collectUsedRunes(codeB)
highlightCodeA := highlight.Code(filename, language, codeA)
highlightCodeB := highlight.Code(filename, language, codeB)
highlightCodeA = hcd.convertToPlaceholders(highlightCodeA)
highlightCodeB = hcd.convertToPlaceholders(highlightCodeB)
diffs := diffMatchPatch.DiffMain(highlightCodeA, highlightCodeB, true)
diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
for i := range diffs {
hcd.recoverOneDiff(&diffs[i])
}
return diffs
}
// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string {
var tagStack []string
res := strings.Builder{}
firstRunForLineTags := hcd.lineWrapperTags == nil
var beforeToken, token string
var valid bool
// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
for {
beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
if !valid || token == "" {
break
}
// write the content before the token into result string, and consume the token in the string
res.WriteString(beforeToken)
// the line wrapper tags should be removed before diff
if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
if firstRunForLineTags {
// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
}
htmlCode = strings.TrimSuffix(htmlCode, "</span>")
continue
}
var tokenInMap string
if strings.HasSuffix(token, "</") { // for closing tag
if len(tagStack) == 0 {
break // invalid diff result, no opening tag but see closing tag
}
// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' { // for opening tag
tokenInMap = token
tagStack = append(tagStack, token)
} else if token[0] == '&' { // for html entity
tokenInMap = token
} // else: impossible
// remember the placeholder and token in the map
placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
if !ok {
placeholder = hcd.nextPlaceholder()
if placeholder != 0 {
hcd.tokenPlaceholderMap[tokenInMap] = placeholder
hcd.placeholderTokenMap[placeholder] = tokenInMap
}
}
if placeholder != 0 {
res.WriteRune(placeholder) // use the placeholder to replace the token
} else {
// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
hcd.placeholderOverflowCount++
if strings.HasPrefix(token, "&") {
// when the token is a html entity, something must be outputted even if there is no placeholder.
res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully?
res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
}
}
}
// write the remaining string
res.WriteString(htmlCode)
return res.String()
}
func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
sb := strings.Builder{}
var tagStack []string
for _, r := range diff.Text {
token, ok := hcd.placeholderTokenMap[r]
if !ok || token == "" {
sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
continue
}
var tokenToRecover string
if strings.HasPrefix(token, "</") { // for closing tag
// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
tokenToRecover = token[:strings.IndexByte(token, '>')+1]
if len(tagStack) == 0 {
continue // if no opening tag in stack yet, skip the closing tag
}
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' { // for opening tag
tokenToRecover = token
tagStack = append(tagStack, token)
} else if token[0] == '&' { // for html entity
tokenToRecover = token
} // else: impossible
sb.WriteString(tokenToRecover)
}
if len(tagStack) > 0 {
// close all opening tags
for i := len(tagStack) - 1; i >= 0; i-- {
tagToClose := tagStack[i]
// get the closing tag "</span>" from "<span class=...>" or "<span>"
pos := strings.IndexAny(tagToClose, " >")
if pos != -1 {
sb.WriteString("</" + tagToClose[1:pos] + ">")
} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
}
}
diff.Text = sb.String()
}

View file

@ -0,0 +1,126 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package gitdiff
import (
"fmt"
"strings"
"testing"
"github.com/sergi/go-diff/diffmatchpatch"
"github.com/stretchr/testify/assert"
)
func TestDiffWithHighlight(t *testing.T) {
hcd := newHighlightCodeDiff()
diffs := hcd.diffWithHighlight(
"main.v", "",
" run('<>')\n",
" run(db)\n",
)
expected := ` <span class="n">run</span><span class="o">(</span><span class="removed-code"><span class="k">&#39;</span><span class="o">&lt;</span><span class="o">&gt;</span><span class="k">&#39;</span></span><span class="o">)</span>` + "\n"
output := diffToHTML(nil, diffs, DiffLineDel)
assert.Equal(t, expected, output)
expected = ` <span class="n">run</span><span class="o">(</span><span class="added-code"><span class="n">db</span></span><span class="o">)</span>` + "\n"
output = diffToHTML(nil, diffs, DiffLineAdd)
assert.Equal(t, expected, output)
hcd = newHighlightCodeDiff()
hcd.placeholderTokenMap['O'] = "<span>"
hcd.placeholderTokenMap['C'] = "</span>"
diff := diffmatchpatch.Diff{}
diff.Text = "OC"
hcd.recoverOneDiff(&diff)
assert.Equal(t, "<span></span>", diff.Text)
diff.Text = "O"
hcd.recoverOneDiff(&diff)
assert.Equal(t, "<span></span>", diff.Text)
diff.Text = "C"
hcd.recoverOneDiff(&diff)
assert.Equal(t, "", diff.Text)
}
func TestDiffWithHighlightPlaceholder(t *testing.T) {
hcd := newHighlightCodeDiff()
diffs := hcd.diffWithHighlight(
"main.js", "",
"a='\U00100000'",
"a='\U0010FFFD''",
)
assert.Equal(t, "", hcd.placeholderTokenMap[0x00100000])
assert.Equal(t, "", hcd.placeholderTokenMap[0x0010FFFD])
expected := fmt.Sprintf(`<span class="line"><span class="cl"><span class="nx">a</span><span class="o">=</span><span class="s1">&#39;</span><span class="removed-code">%s</span>&#39;</span></span>`, "\U00100000")
output := diffToHTML(hcd.lineWrapperTags, diffs, DiffLineDel)
assert.Equal(t, expected, output)
hcd = newHighlightCodeDiff()
diffs = hcd.diffWithHighlight(
"main.js", "",
"a='\U00100000'",
"a='\U0010FFFD'",
)
expected = fmt.Sprintf(`<span class="nx">a</span><span class="o">=</span><span class="s1">&#39;</span><span class="added-code">%s</span>&#39;`, "\U0010FFFD")
output = diffToHTML(nil, diffs, DiffLineAdd)
assert.Equal(t, expected, output)
}
func TestDiffWithHighlightPlaceholderExhausted(t *testing.T) {
hcd := newHighlightCodeDiff()
hcd.placeholderMaxCount = 0
diffs := hcd.diffWithHighlight(
"main.js", "",
"'",
``,
)
output := diffToHTML(nil, diffs, DiffLineDel)
expected := fmt.Sprintf(`<span class="removed-code">%s#39;</span>`, "\uFFFD")
assert.Equal(t, expected, output)
hcd = newHighlightCodeDiff()
hcd.placeholderMaxCount = 0
diffs = hcd.diffWithHighlight(
"main.js", "",
"a < b",
"a > b",
)
output = diffToHTML(nil, diffs, DiffLineDel)
expected = fmt.Sprintf(`a %s<span class="removed-code">l</span>t; b`, "\uFFFD")
assert.Equal(t, expected, output)
output = diffToHTML(nil, diffs, DiffLineAdd)
expected = fmt.Sprintf(`a %s<span class="added-code">g</span>t; b`, "\uFFFD")
assert.Equal(t, expected, output)
}
func TestDiffWithHighlightTagMatch(t *testing.T) {
totalOverflow := 0
for i := 0; i < 100; i++ {
hcd := newHighlightCodeDiff()
hcd.placeholderMaxCount = i
diffs := hcd.diffWithHighlight(
"main.js", "",
"a='1'",
"b='2'",
)
totalOverflow += hcd.placeholderOverflowCount
output := diffToHTML(nil, diffs, DiffLineDel)
c1 := strings.Count(output, "<span")
c2 := strings.Count(output, "</span")
assert.Equal(t, c1, c2)
output = diffToHTML(nil, diffs, DiffLineAdd)
c1 = strings.Count(output, "<span")
c2 = strings.Count(output, "</span")
assert.Equal(t, c1, c2)
}
assert.NotZero(t, totalOverflow)
}