Detect charset and convert non UTF-8 files for display (#4950)
* Detect charset and convert non UTF-8 files for display
* Refactor and move function to correct module
* Revert unrelated changes
* More unrelated changes
* Duplicate content for small text to have better encoding detection
* Check if original content is valid before duplicating it
This commit is contained in:
parent
6780661192
commit
81702e6ec9
3 changed files with 44 additions and 4 deletions
|
@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) {
|
||||||
return "UTF-8", nil
|
return "UTF-8", nil
|
||||||
}
|
}
|
||||||
|
|
||||||
result, err := chardet.NewTextDetector().DetectBest(content)
|
textDetector := chardet.NewTextDetector()
|
||||||
|
var detectContent []byte
|
||||||
|
if len(content) < 1024 {
|
||||||
|
// Check if original content is valid
|
||||||
|
if _, err := textDetector.DetectBest(content); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
times := 1024 / len(content)
|
||||||
|
detectContent = make([]byte, 0, times*len(content))
|
||||||
|
for i := 0; i < times; i++ {
|
||||||
|
detectContent = append(detectContent, content...)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
detectContent = content
|
||||||
|
}
|
||||||
|
result, err := textDetector.DetectBest(detectContent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
// Copyright 2018 The Gitea Authors. All rights reserved.
|
||||||
// Copyright 2014 The Gogs Authors. All rights reserved.
|
// Copyright 2014 The Gogs Authors. All rights reserved.
|
||||||
// Use of this source code is governed by a MIT-style
|
// Use of this source code is governed by a MIT-style
|
||||||
// license that can be found in the LICENSE file.
|
// license that can be found in the LICENSE file.
|
||||||
|
@ -275,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there is an error, we concatenate the nicely decoded part and the
|
// If there is an error, we concatenate the nicely decoded part and the
|
||||||
// original left over. This way we won't loose data.
|
// original left over. This way we won't lose data.
|
||||||
result, n, err := transform.String(encoding.NewDecoder(), string(content))
|
result, n, err := transform.String(encoding.NewDecoder(), string(content))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
result = result + string(content[n:])
|
result = result + string(content[n:])
|
||||||
|
@ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) {
|
||||||
return result, err
|
return result, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8 if possible
|
||||||
|
func ToUTF8WithFallback(content []byte) []byte {
|
||||||
|
charsetLabel, err := base.DetectEncoding(content)
|
||||||
|
if err != nil || charsetLabel == "UTF-8" {
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
|
||||||
|
encoding, _ := charset.Lookup(charsetLabel)
|
||||||
|
if encoding == nil {
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
|
||||||
|
// If there is an error, we concatenate the nicely decoded part and the
|
||||||
|
// original left over. This way we won't lose data.
|
||||||
|
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
|
||||||
|
if err != nil {
|
||||||
|
return append(result, content[n:]...)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
// ToUTF8 converts content to UTF8 encoding and ignore error
|
// ToUTF8 converts content to UTF8 encoding and ignore error
|
||||||
func ToUTF8(content string) string {
|
func ToUTF8(content string) string {
|
||||||
res, _ := ToUTF8WithErr([]byte(content))
|
res, _ := ToUTF8WithErr([]byte(content))
|
||||||
|
|
|
@ -25,6 +25,7 @@ import (
|
||||||
"code.gitea.io/gitea/modules/markup"
|
"code.gitea.io/gitea/modules/markup"
|
||||||
"code.gitea.io/gitea/modules/setting"
|
"code.gitea.io/gitea/modules/setting"
|
||||||
"code.gitea.io/gitea/modules/templates"
|
"code.gitea.io/gitea/modules/templates"
|
||||||
|
|
||||||
"github.com/Unknwon/paginater"
|
"github.com/Unknwon/paginater"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) {
|
||||||
ctx.Data["FileSize"] = readmeFile.Size()
|
ctx.Data["FileSize"] = readmeFile.Size()
|
||||||
} else {
|
} else {
|
||||||
d, _ := ioutil.ReadAll(dataRc)
|
d, _ := ioutil.ReadAll(dataRc)
|
||||||
buf = append(buf, d...)
|
buf = templates.ToUTF8WithFallback(append(buf, d...))
|
||||||
|
|
||||||
if markup.Type(readmeFile.Name()) != "" {
|
if markup.Type(readmeFile.Name()) != "" {
|
||||||
ctx.Data["IsMarkup"] = true
|
ctx.Data["IsMarkup"] = true
|
||||||
ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
|
ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
|
||||||
|
@ -203,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
|
||||||
}
|
}
|
||||||
|
|
||||||
d, _ := ioutil.ReadAll(dataRc)
|
d, _ := ioutil.ReadAll(dataRc)
|
||||||
buf = append(buf, d...)
|
buf = templates.ToUTF8WithFallback(append(buf, d...))
|
||||||
|
|
||||||
readmeExist := markup.IsReadmeFile(blob.Name())
|
readmeExist := markup.IsReadmeFile(blob.Name())
|
||||||
ctx.Data["ReadmeExist"] = readmeExist
|
ctx.Data["ReadmeExist"] = readmeExist
|
||||||
|
|
Reference in a new issue