Markdown: Sanitizer Configuration (#9075)

* Support custom sanitization policy

Allowing the gitea administrator to configure sanitization policy allows
them to couple external renders and custom templates to support more
markup. In particular, the `pandoc` renderer allows generating KaTeX
annotations, wrapping them in `<span>` elements with class `math` and
either `inline` or `display` (depending on whether inline or block mode
was requested).

This iteration gives the administrator whitelisting powers; carefully
crafted regexes will thus let through only the desired attributes
necessary to support their custom markup.

Resolves: #9054

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Document new sanitization configuration

 - Adds basic documentation to app.ini.sample,
 - Adds an example to the Configuration Cheat Sheet, and
 - Adds extended information to External Renderers section.

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Drop extraneous length check in newMarkupSanitizer(...)

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>

* Fix plural ELEMENT and ALLOW_ATTR in docs

These were left over from their initial names. Make them singular to
conform with the current expectations.

Signed-off-by: Alexander Scheel <alexander.m.scheel@gmail.com>
This commit is contained in:
Alexander Scheel 2019-12-07 14:49:04 -05:00 committed by techknowlogick
parent cecc31951c
commit ee7df7ba8c
5 changed files with 155 additions and 29 deletions

View file

@ -877,6 +877,12 @@ SHOW_FOOTER_VERSION = true
; Show template execution time in the footer ; Show template execution time in the footer
SHOW_FOOTER_TEMPLATE_LOAD_TIME = true SHOW_FOOTER_TEMPLATE_LOAD_TIME = true
[markup.sanitizer]
; The following keys can be used multiple times to define sanitization policy rules.
;ELEMENT = span
;ALLOW_ATTR = class
;REGEXP = ^(info|warning|error)$
[markup.asciidoc] [markup.asciidoc]
ENABLED = false ENABLED = false
; List of file extensions that should be rendered by an external command ; List of file extensions that should be rendered by an external command

View file

@ -578,6 +578,24 @@ Two special environment variables are passed to the render command:
- `GITEA_PREFIX_SRC`, which contains the current URL prefix in the `src` path tree. To be used as prefix for links. - `GITEA_PREFIX_SRC`, which contains the current URL prefix in the `src` path tree. To be used as prefix for links.
- `GITEA_PREFIX_RAW`, which contains the current URL prefix in the `raw` path tree. To be used as prefix for image paths. - `GITEA_PREFIX_RAW`, which contains the current URL prefix in the `raw` path tree. To be used as prefix for image paths.
Gitea supports customizing the sanitization policy for rendered HTML. The example below will support KaTeX output from pandoc.
```ini
[markup.sanitizer]
; Pandoc renders TeX segments as <span>s with the "math" class, optionally
; with "inline" or "display" classes depending on context.
ELEMENT = span
ALLOW_ATTR = class
REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+
```
- `ELEMENT`: The element this policy applies to. Must be non-empty.
- `ALLOW_ATTR`: The attribute this policy allows. Must be non-empty.
- `REGEXP`: A regex to match the contents of the attribute against. Must be present but may be empty for unconditional whitelisting of this attribute.
You may redefine `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; each time all three are defined is a single policy entry.
## Time (`time`) ## Time (`time`)
- `FORMAT`: Time format to display on UI. i.e. RFC1123 or 2006-01-02 15:04:05 - `FORMAT`: Time format to display on UI. i.e. RFC1123 or 2006-01-02 15:04:05

View file

@ -68,4 +68,22 @@ RENDER_COMMAND = rst2html.py
IS_INPUT_FILE = false IS_INPUT_FILE = false
``` ```
If your external markup relies on additional classes and attributes on the generated HTML elements, you might need to enable custom sanitizer policies. Gitea uses the [`bluemonday`](https://godoc.org/github.com/microcosm-cc/bluemonday) package as our HTML sanitizer. The example below will support [KaTeX](https://katex.org/) output from [`pandoc`](https://pandoc.org/).
```ini
[markup.sanitizer]
; Pandoc renders TeX segments as <span>s with the "math" class, optionally
; with "inline" or "display" classes depending on context.
ELEMENT = span
ALLOW_ATTR = class
REGEXP = ^\s*((math(\s+|$)|inline(\s+|$)|display(\s+|$)))+
[markup.markdown]
ENABLED = true
FILE_EXTENSIONS = .md,.markdown
RENDER_COMMAND = pandoc -f markdown -t html --katex
```
You may redefine `ELEMENT`, `ALLOW_ATTR`, and `REGEXP` multiple times; each time all three are defined is a single policy entry. All three must be defined, but `REGEXP` may be blank to allow unconditional whitelisting of that attribute.
Once your configuration changes have been made, restart Gitea to have changes take effect. Once your configuration changes have been made, restart Gitea to have changes take effect.

View file

@ -50,6 +50,15 @@ func ReplaceSanitizer() {
// Allow <kbd> tags for keyboard shortcut styling // Allow <kbd> tags for keyboard shortcut styling
sanitizer.policy.AllowElements("kbd") sanitizer.policy.AllowElements("kbd")
// Custom keyword markup
for _, rule := range setting.ExternalSanitizerRules {
if rule.Regexp != nil {
sanitizer.policy.AllowAttrs(rule.AllowAttr).Matching(rule.Regexp).OnElements(rule.Element)
} else {
sanitizer.policy.AllowAttrs(rule.AllowAttr).OnElements(rule.Element)
}
}
} }
// Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist. // Sanitize takes a string that contains a HTML fragment or document and applies policy whitelist.

View file

@ -9,11 +9,14 @@ import (
"strings" "strings"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"gopkg.in/ini.v1"
) )
// ExternalMarkupParsers represents the external markup parsers // ExternalMarkupParsers represents the external markup parsers
var ( var (
ExternalMarkupParsers []MarkupParser ExternalMarkupParsers []MarkupParser
ExternalSanitizerRules []MarkupSanitizerRule
) )
// MarkupParser defines the external parser configured in ini // MarkupParser defines the external parser configured in ini
@ -25,8 +28,15 @@ type MarkupParser struct {
IsInputFile bool IsInputFile bool
} }
// MarkupSanitizerRule is a single whitelist entry for the HTML sanitizer:
// it permits one attribute on one element, optionally constrained by a
// regular expression on the attribute's value.
type MarkupSanitizerRule struct {
	// Element is the HTML element the rule applies to.
	Element string
	// AllowAttr is the attribute to allow on Element.
	AllowAttr string
	// Regexp restricts the allowed attribute values. A nil Regexp means the
	// attribute is allowed unconditionally.
	Regexp *regexp.Regexp
}
func newMarkup() { func newMarkup() {
extensionReg := regexp.MustCompile(`\.\w`)
for _, sec := range Cfg.Section("markup").ChildSections() { for _, sec := range Cfg.Section("markup").ChildSections() {
name := strings.TrimPrefix(sec.Name(), "markup.") name := strings.TrimPrefix(sec.Name(), "markup.")
if name == "" { if name == "" {
@ -34,6 +44,72 @@ func newMarkup() {
continue continue
} }
if name == "sanitizer" {
newMarkupSanitizer(name, sec)
} else {
newMarkupRenderer(name, sec)
}
}
}
// newMarkupSanitizer parses the markup.<name> sanitizer section into
// ExternalSanitizerRules.
//
// ELEMENT, ALLOW_ATTR and REGEXP may each be repeated; the i-th values of the
// three keys together form one rule, so all three keys must be present and
// must be defined the same number of times. An empty REGEXP produces a rule
// with a nil Regexp, i.e. the attribute is allowed unconditionally.
//
// Section-level problems (missing keys, mismatched counts) are logged and
// leave ExternalSanitizerRules untouched. Problems with a single definition
// (empty ELEMENT/ALLOW_ATTR, invalid REGEXP) are logged and only that
// definition is skipped.
func newMarkupSanitizer(name string, sec *ini.Section) {
	haveElement := sec.HasKey("ELEMENT")
	haveAttr := sec.HasKey("ALLOW_ATTR")
	haveRegexp := sec.HasKey("REGEXP")

	if !haveElement && !haveAttr && !haveRegexp {
		log.Warn("Skipping empty section: markup.%s.", name)
		return
	}

	if !haveElement || !haveAttr || !haveRegexp {
		log.Error("Missing required keys from markup.%s. Must have all three of ELEMENT, ALLOW_ATTR, and REGEXP defined!", name)
		return
	}

	elements := sec.Key("ELEMENT").ValueWithShadows()
	allowAttrs := sec.Key("ALLOW_ATTR").ValueWithShadows()
	regexps := sec.Key("REGEXP").ValueWithShadows()

	// The three value lists are zipped by position below, so they must line up.
	if len(elements) != len(allowAttrs) ||
		len(elements) != len(regexps) {
		log.Error("All three keys in markup.%s (ELEMENT, ALLOW_ATTR, REGEXP) must be defined the same number of times! Got %d, %d, and %d respectively.", name, len(elements), len(allowAttrs), len(regexps))
		return
	}

	ExternalSanitizerRules = make([]MarkupSanitizerRule, 0, len(elements))

	for index, pattern := range regexps {
		element, allowAttr := elements[index], allowAttrs[index]

		// The documented contract requires both to be non-empty; a rule
		// without a target element or attribute cannot whitelist anything.
		if element == "" || allowAttr == "" {
			log.Error("In markup.%s: ELEMENT and ALLOW_ATTR at definition %d must be non-empty", name, index+1)
			continue
		}

		// An empty pattern leaves compiled nil, which marks the attribute as
		// unconditionally allowed. Non-empty patterns are compiled here so an
		// invalid expression is reported at config load time and consumers
		// can use the stored *regexp.Regexp directly.
		var compiled *regexp.Regexp
		if pattern != "" {
			var err error
			compiled, err = regexp.Compile(pattern)
			if err != nil {
				log.Error("In markup.%s: REGEXP at definition %d failed to compile: %v", name, index+1, err)
				continue
			}
		}

		ExternalSanitizerRules = append(ExternalSanitizerRules, MarkupSanitizerRule{
			Element:   element,
			AllowAttr: allowAttr,
			Regexp:    compiled,
		})
	}
}
func newMarkupRenderer(name string, sec *ini.Section) {
extensionReg := regexp.MustCompile(`\.\w`)
extensions := sec.Key("FILE_EXTENSIONS").Strings(",") extensions := sec.Key("FILE_EXTENSIONS").Strings(",")
var exts = make([]string, 0, len(extensions)) var exts = make([]string, 0, len(extensions))
for _, extension := range extensions { for _, extension := range extensions {
@ -46,13 +122,13 @@ func newMarkup() {
if len(exts) == 0 { if len(exts) == 0 {
log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored") log.Warn(sec.Name() + " file extension is empty, markup " + name + " ignored")
continue return
} }
command := sec.Key("RENDER_COMMAND").MustString("") command := sec.Key("RENDER_COMMAND").MustString("")
if command == "" { if command == "" {
log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored") log.Warn(" RENDER_COMMAND is empty, markup " + name + " ignored")
continue return
} }
ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{ ExternalMarkupParsers = append(ExternalMarkupParsers, MarkupParser{
@ -62,5 +138,4 @@ func newMarkup() {
Command: command, Command: command,
IsInputFile: sec.Key("IS_INPUT_FILE").MustBool(false), IsInputFile: sec.Key("IS_INPUT_FILE").MustBool(false),
}) })
}
} }