Fix chardet test and add ordering option (#11621)
* Fix chardet test and add ordering option Signed-off-by: Andrew Thornton <art27@cantab.net> * minor fixes Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log Signed-off-by: Andrew Thornton <art27@cantab.net> * remove log2 Signed-off-by: Andrew Thornton <art27@cantab.net> * only iterate through top results Signed-off-by: Andrew Thornton <art27@cantab.net> * Update docs/content/doc/advanced/config-cheat-sheet.en-us.md * slight restructure of for loop Signed-off-by: Andrew Thornton <art27@cantab.net> Co-authored-by: techknowlogick <techknowlogick@gitea.io>
This commit is contained in:
parent
fe2cacf5ea
commit
a1ad188326
5 changed files with 117 additions and 6 deletions
|
@ -14,7 +14,12 @@ RUN_MODE = dev
|
||||||
[repository]
|
[repository]
|
||||||
ROOT =
|
ROOT =
|
||||||
SCRIPT_TYPE = bash
|
SCRIPT_TYPE = bash
|
||||||
; Default ANSI charset
|
; DETECTED_CHARSETS_ORDER tie-break order for detected charsets.
|
||||||
|
; If the charsets have equal confidence, tie-breaking will be done by order in this list
|
||||||
|
; with charsets earlier in the list chosen in preference to those later.
|
||||||
|
; Adding "defaults" will place the default charsets not otherwise named in this list at that position.
|
||||||
|
DETECTED_CHARSETS_ORDER=UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr
|
||||||
|
; Default ANSI charset to override non-UTF-8 charsets to
|
||||||
ANSI_CHARSET =
|
ANSI_CHARSET =
|
||||||
; Force every new repository to be private
|
; Force every new repository to be private
|
||||||
FORCE_PRIVATE = false
|
FORCE_PRIVATE = false
|
||||||
|
|
|
@ -46,7 +46,8 @@ Values containing `#` or `;` must be quoted using `` ` `` or `"""`.
|
||||||
an absolute path.
|
an absolute path.
|
||||||
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
|
- `SCRIPT_TYPE`: **bash**: The script type this server supports. Usually this is `bash`,
|
||||||
but some users report that only `sh` is available.
|
but some users report that only `sh` is available.
|
||||||
- `ANSI_CHARSET`: **\<empty\>**: The default charset for an unrecognized charset.
|
- `DETECTED_CHARSETS_ORDER`: **UTF-8, UTF-16BE, UTF-16LE, UTF-32BE, UTF-32LE, ISO-8859-1, windows-1252, ISO-8859-2, windows-1250, ISO-8859-5, ISO-8859-6, ISO-8859-7, windows-1253, ISO-8859-8-I, windows-1255, ISO-8859-8, windows-1251, windows-1256, KOI8-R, ISO-8859-9, windows-1254, Shift_JIS, GB18030, EUC-JP, EUC-KR, Big5, ISO-2022-JP, ISO-2022-KR, ISO-2022-CN, IBM424_rtl, IBM424_ltr, IBM420_rtl, IBM420_ltr**: Tie-break order of detected charsets - if the detected charsets have equal confidence, charsets earlier in the list will be chosen in preference to those later. Adding `defaults` will place the unnamed charsets at that point.
|
||||||
|
- `ANSI_CHARSET`: **\<empty\>**: Default ANSI charset to override non-UTF-8 charsets to.
|
||||||
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
|
- `FORCE_PRIVATE`: **false**: Force every new repository to be private.
|
||||||
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
|
- `DEFAULT_PRIVATE`: **last**: Default private when creating a new repository.
|
||||||
\[last, private, public\]
|
\[last, private, public\]
|
||||||
|
|
|
@ -7,6 +7,7 @@ package charset
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"code.gitea.io/gitea/modules/log"
|
"code.gitea.io/gitea/modules/log"
|
||||||
|
@ -137,16 +138,42 @@ func DetectEncoding(content []byte) (string, error) {
|
||||||
} else {
|
} else {
|
||||||
detectContent = content
|
detectContent = content
|
||||||
}
|
}
|
||||||
result, err := textDetector.DetectBest(detectContent)
|
|
||||||
|
// Now we can't use DetectBest or just results[0] because the result isn't stable - so we need a tie break
|
||||||
|
results, err := textDetector.DetectAll(detectContent)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if err == chardet.NotDetectedError && len(setting.Repository.AnsiCharset) > 0 {
|
||||||
|
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||||
|
return setting.Repository.AnsiCharset, nil
|
||||||
|
}
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
topConfidence := results[0].Confidence
|
||||||
|
topResult := results[0]
|
||||||
|
priority, has := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(topResult.Charset))]
|
||||||
|
for _, result := range results {
|
||||||
|
// As results are sorted in confidence order - if we have a different confidence
|
||||||
|
// we know it's less than the current confidence and can break out of the loop early
|
||||||
|
if result.Confidence != topConfidence {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
// Otherwise check if this result is earlier in the DetectedCharsetsOrder than our current top guess
|
||||||
|
resultPriority, resultHas := setting.Repository.DetectedCharsetScore[strings.ToLower(strings.TrimSpace(result.Charset))]
|
||||||
|
if resultHas && (!has || resultPriority < priority) {
|
||||||
|
topResult = result
|
||||||
|
priority = resultPriority
|
||||||
|
has = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
|
// FIXME: to properly decouple this function the fallback ANSI charset should be passed as an argument
|
||||||
if result.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
|
if topResult.Charset != "UTF-8" && len(setting.Repository.AnsiCharset) > 0 {
|
||||||
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
log.Debug("Using default AnsiCharset: %s", setting.Repository.AnsiCharset)
|
||||||
return setting.Repository.AnsiCharset, err
|
return setting.Repository.AnsiCharset, err
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("Detected encoding: %s", result.Charset)
|
log.Debug("Detected encoding: %s", topResult.Charset)
|
||||||
return result.Charset, err
|
return topResult.Charset, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -230,7 +230,11 @@ func TestDetectEncoding(t *testing.T) {
|
||||||
// we accept either.
|
// we accept either.
|
||||||
assert.Contains(t, encoding, "ISO-8859")
|
assert.Contains(t, encoding, "ISO-8859")
|
||||||
|
|
||||||
|
old := setting.Repository.AnsiCharset
|
||||||
setting.Repository.AnsiCharset = "placeholder"
|
setting.Repository.AnsiCharset = "placeholder"
|
||||||
|
defer func() {
|
||||||
|
setting.Repository.AnsiCharset = old
|
||||||
|
}()
|
||||||
testSuccess(b, "placeholder")
|
testSuccess(b, "placeholder")
|
||||||
|
|
||||||
// invalid bytes
|
// invalid bytes
|
||||||
|
|
|
@ -24,6 +24,8 @@ const (
|
||||||
// Repository settings
|
// Repository settings
|
||||||
var (
|
var (
|
||||||
Repository = struct {
|
Repository = struct {
|
||||||
|
DetectedCharsetsOrder []string
|
||||||
|
DetectedCharsetScore map[string]int `ini:"-"`
|
||||||
AnsiCharset string
|
AnsiCharset string
|
||||||
ForcePrivate bool
|
ForcePrivate bool
|
||||||
DefaultPrivate string
|
DefaultPrivate string
|
||||||
|
@ -88,6 +90,42 @@ var (
|
||||||
Wiki []string
|
Wiki []string
|
||||||
} `ini:"repository.signing"`
|
} `ini:"repository.signing"`
|
||||||
}{
|
}{
|
||||||
|
DetectedCharsetsOrder: []string{
|
||||||
|
"UTF-8",
|
||||||
|
"UTF-16BE",
|
||||||
|
"UTF-16LE",
|
||||||
|
"UTF-32BE",
|
||||||
|
"UTF-32LE",
|
||||||
|
"ISO-8859-1",
|
||||||
|
"windows-1252",
|
||||||
|
"ISO-8859-2",
|
||||||
|
"windows-1250",
|
||||||
|
"ISO-8859-5",
|
||||||
|
"ISO-8859-6",
|
||||||
|
"ISO-8859-7",
|
||||||
|
"windows-1253",
|
||||||
|
"ISO-8859-8-I",
|
||||||
|
"windows-1255",
|
||||||
|
"ISO-8859-8",
|
||||||
|
"windows-1251",
|
||||||
|
"windows-1256",
|
||||||
|
"KOI8-R",
|
||||||
|
"ISO-8859-9",
|
||||||
|
"windows-1254",
|
||||||
|
"Shift_JIS",
|
||||||
|
"GB18030",
|
||||||
|
"EUC-JP",
|
||||||
|
"EUC-KR",
|
||||||
|
"Big5",
|
||||||
|
"ISO-2022-JP",
|
||||||
|
"ISO-2022-KR",
|
||||||
|
"ISO-2022-CN",
|
||||||
|
"IBM424_rtl",
|
||||||
|
"IBM424_ltr",
|
||||||
|
"IBM420_rtl",
|
||||||
|
"IBM420_ltr",
|
||||||
|
},
|
||||||
|
DetectedCharsetScore: map[string]int{},
|
||||||
AnsiCharset: "",
|
AnsiCharset: "",
|
||||||
ForcePrivate: false,
|
ForcePrivate: false,
|
||||||
DefaultPrivate: RepoCreatingLastUserVisibility,
|
DefaultPrivate: RepoCreatingLastUserVisibility,
|
||||||
|
@ -208,6 +246,10 @@ func newRepository() {
|
||||||
} else {
|
} else {
|
||||||
RepoRootPath = filepath.Clean(RepoRootPath)
|
RepoRootPath = filepath.Clean(RepoRootPath)
|
||||||
}
|
}
|
||||||
|
defaultDetectedCharsetsOrder := make([]string, 0, len(Repository.DetectedCharsetsOrder))
|
||||||
|
for _, charset := range Repository.DetectedCharsetsOrder {
|
||||||
|
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder, strings.ToLower(strings.TrimSpace(charset)))
|
||||||
|
}
|
||||||
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")
|
ScriptType = sec.Key("SCRIPT_TYPE").MustString("bash")
|
||||||
|
|
||||||
if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
|
if err = Cfg.Section("repository").MapTo(&Repository); err != nil {
|
||||||
|
@ -222,6 +264,38 @@ func newRepository() {
|
||||||
log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
|
log.Fatal("Failed to map Repository.PullRequest settings: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
preferred := make([]string, 0, len(Repository.DetectedCharsetsOrder))
|
||||||
|
for _, charset := range Repository.DetectedCharsetsOrder {
|
||||||
|
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
|
||||||
|
preferred = append(preferred, canonicalCharset)
|
||||||
|
// remove it from the defaults
|
||||||
|
for i, charset := range defaultDetectedCharsetsOrder {
|
||||||
|
if charset == canonicalCharset {
|
||||||
|
defaultDetectedCharsetsOrder = append(defaultDetectedCharsetsOrder[:i], defaultDetectedCharsetsOrder[i+1:]...)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
i := 0
|
||||||
|
for _, charset := range preferred {
|
||||||
|
// Add the defaults
|
||||||
|
if charset == "defaults" {
|
||||||
|
for _, charset := range defaultDetectedCharsetsOrder {
|
||||||
|
canonicalCharset := strings.ToLower(strings.TrimSpace(charset))
|
||||||
|
if _, has := Repository.DetectedCharsetScore[canonicalCharset]; !has {
|
||||||
|
Repository.DetectedCharsetScore[canonicalCharset] = i
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if _, has := Repository.DetectedCharsetScore[charset]; !has {
|
||||||
|
Repository.DetectedCharsetScore[charset] = i
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if !filepath.IsAbs(Repository.Upload.TempPath) {
|
if !filepath.IsAbs(Repository.Upload.TempPath) {
|
||||||
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
|
Repository.Upload.TempPath = path.Join(AppWorkPath, Repository.Upload.TempPath)
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue