UI: Detect and restore encoding and BOM in content (#6727)

* detect and remove a decoded BOM

Signed-off-by: Andrew Thornton <art27@cantab.net>

* Restore the previous encoding and BOM

* On error keep as UTF-8

Signed-off-by: Andrew Thornton <art27@cantab.net>

* create remove BOM function

* Deal with LFSed content

* Update modules/repofiles/update.go

* Fix final LFS bug

* Keep LFS sections referring to opts.Content
This commit is contained in:
zeripath 2019-04-26 13:00:30 +01:00 committed by Lauris BH
parent 4c34bc111c
commit f6eedd4dc8
3 changed files with 114 additions and 7 deletions

View file

@ -5,6 +5,7 @@
package base
import (
"bytes"
"crypto/md5"
"crypto/rand"
"crypto/sha1"
@ -36,6 +37,9 @@ import (
"github.com/gogits/chardet"
)
// UTF8BOM holds the three bytes that encode the byte-order mark (U+FEFF) in UTF-8.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
// EncodeMD5 encodes string to md5 hex value.
func EncodeMD5(str string) string {
m := md5.New()
@ -91,6 +95,14 @@ func DetectEncoding(content []byte) (string, error) {
return result.Charset, err
}
// RemoveBOMIfPresent returns content without its leading UTF-8 byte-order
// mark. When content does not begin with UTF8BOM it is returned unchanged.
// The result shares its backing array with content; nothing is copied.
func RemoveBOMIfPresent(content []byte) []byte {
	return bytes.TrimPrefix(content, UTF8BOM)
}
// BasicAuthDecode decode basic auth string
func BasicAuthDecode(encoded string) (string, string, error) {
s, err := base64.StdEncoding.DecodeString(encoded)

View file

@ -5,13 +5,19 @@
package repofiles
import (
"bytes"
"fmt"
"path"
"strings"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/sdk/gitea"
)
@ -37,6 +43,70 @@ type UpdateRepoFileOptions struct {
Committer *IdentityOptions
}
// detectEncodingAndBOM samples up to the first 1024 bytes of the blob behind
// entry and returns the charset label reported by base.DetectEncoding, plus
// whether the content (after decoding to UTF-8 where necessary) begins with a
// UTF-8 byte-order mark.
//
// When setting.LFS.StartServer is set and the sample is an LFS pointer for
// which repo has a meta object, the sample is re-read from the LFS object so
// that the real content, not the pointer text, is inspected.
//
// The function is best-effort: every failure (unreadable blob, detection
// error, charset unknown to charset.Lookup) yields the default "UTF-8", false.
func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
	reader, err := entry.Blob().DataAsync()
	if err != nil {
		// return default
		return "UTF-8", false
	}
	defer reader.Close()

	buf := make([]byte, 1024)
	n, err := reader.Read(buf)
	// A reader may hand back data together with an error (typically io.EOF on
	// a short file), so only give up when nothing at all was read.
	if err != nil && n == 0 {
		// return default
		return "UTF-8", false
	}
	buf = buf[:n]

	if setting.LFS.StartServer {
		meta := lfs.IsPointerFile(&buf)
		if meta != nil {
			meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
			if err != nil && err != models.ErrLFSObjectNotExist {
				// return default
				return "UTF-8", false
			}
		}
		// NOTE(review): presumably meta is nil when the object does not exist,
		// in which case the pointer text itself is what gets inspected below.
		if meta != nil {
			dataRc, err := lfs.ReadMetaObject(meta)
			if err != nil {
				// return default
				return "UTF-8", false
			}
			defer dataRc.Close()

			buf = make([]byte, 1024)
			n, err = dataRc.Read(buf)
			if err != nil && n == 0 {
				// return default
				return "UTF-8", false
			}
			buf = buf[:n]
		}
	}

	encoding, err := base.DetectEncoding(buf)
	if err != nil {
		// just default to utf-8 and no bom
		return "UTF-8", false
	}
	if encoding == "UTF-8" {
		// HasPrefix is length-safe, unlike slicing buf[0:3] on a short sample.
		return encoding, bytes.HasPrefix(buf, base.UTF8BOM)
	}

	charsetEncoding, _ := charset.Lookup(encoding)
	if charsetEncoding == nil {
		return "UTF-8", false
	}

	// The decode error is deliberately ignored: the sample may be cut in the
	// middle of a character, and only the start of the decoded text matters.
	// The check is made on the decoded text itself rather than on the number
	// of source bytes consumed, which says nothing about the output length.
	decoded, _, _ := transform.String(charsetEncoding.NewDecoder(), string(buf))
	return encoding, bytes.HasPrefix([]byte(decoded), base.UTF8BOM)
}
// CreateOrUpdateRepoFile adds or updates a file in the given repository
func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepoFileOptions) (*gitea.FileResponse, error) {
// If no branch name is set, assume master
@ -118,6 +188,9 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
opts.LastCommitID = commit.ID.String()
}
encoding := "UTF-8"
bom := false
if !opts.IsNewFile {
fromEntry, err := commit.GetTreeEntryByPath(fromTreePath)
if err != nil {
@ -151,6 +224,7 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
// haven't been made. We throw an error if one wasn't provided.
return nil, models.ErrSHAOrCommitIDNotProvided{}
}
encoding, bom = detectEncodingAndBOM(fromEntry, repo)
}
// For the path where this file will be created/updated, we need to make
@ -235,9 +309,28 @@ func CreateOrUpdateRepoFile(repo *models.Repository, doer *models.User, opts *Up
}
content := opts.Content
if bom {
content = string(base.UTF8BOM) + content
}
if encoding != "UTF-8" {
charsetEncoding, _ := charset.Lookup(encoding)
if charsetEncoding != nil {
result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
if err != nil {
// If we can't re-encode the content back into the original encoding, we should just stick with UTF-8
log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.TreePath, opts.FromTreePath, encoding, err)
result = content
}
content = result
} else {
log.Error("Unknown encoding: %s", encoding)
}
}
// Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content
opts.Content = content
var lfsMetaObject *models.LFSMetaObject
if filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
if setting.LFS.StartServer && filename2attribute2info[treePath] != nil && filename2attribute2info[treePath]["filter"] == "lfs" {
// OK so we are supposed to LFS this data!
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
if err != nil {

View file

@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
return string(content), nil
return string(base.RemoveBOMIfPresent(content)), nil
}
encoding, _ := charset.Lookup(charsetLabel)
@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.String(encoding.NewDecoder(), string(content))
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
result = result + string(content[n:])
result = append(result, content[n:]...)
}
return result, err
result = base.RemoveBOMIfPresent(result)
return string(result), err
}
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return content
return base.RemoveBOMIfPresent(content)
}
encoding, _ := charset.Lookup(charsetLabel)
@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
return append(result, content[n:]...)
}
return result
return base.RemoveBOMIfPresent(result)
}
// ToUTF8 converts content to UTF8 encoding and ignore error