Update emoji regex (#11584) (#11679)

When matching emoji, use a regex built from the data we have instead of something generic using unicode ranges. A generic regex can't tell the difference between two separate emoji next to each other or one emoji that is built out of two separate emoji next to each other.

This means that emoji that are next to each other without a space in between will now each be wrapped accurately in their own span, with the proper title, etc.
This commit is contained in:
mrsdizzie 2020-05-29 17:12:53 -04:00 committed by GitHub
parent 6ad0d0a1b9
commit 0e44fab5d6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 7 deletions

View file

@ -6,8 +6,10 @@
package emoji package emoji
import ( import (
"sort"
"strings" "strings"
"sync" "sync"
"unicode/utf8"
) )
// Gemoji is a set of emoji data. // Gemoji is a set of emoji data.
@ -48,6 +50,12 @@ func loadMap() {
// process emoji codes and aliases // process emoji codes and aliases
codePairs := make([]string, 0) codePairs := make([]string, 0)
aliasPairs := make([]string, 0) aliasPairs := make([]string, 0)
// sort from largest to small so we match combined emoji first
sort.Slice(GemojiData, func(i, j int) bool {
return len(GemojiData[i].Emoji) > len(GemojiData[j].Emoji)
})
for i, e := range GemojiData { for i, e := range GemojiData {
if e.Emoji == "" || len(e.Aliases) == 0 { if e.Emoji == "" || len(e.Aliases) == 0 {
continue continue
@ -72,6 +80,7 @@ func loadMap() {
codeReplacer = strings.NewReplacer(codePairs...) codeReplacer = strings.NewReplacer(codePairs...)
aliasReplacer = strings.NewReplacer(aliasPairs...) aliasReplacer = strings.NewReplacer(aliasPairs...)
}) })
} }
// FromCode retrieves the emoji data based on the provided unicode code (ie, // FromCode retrieves the emoji data based on the provided unicode code (ie,
@ -117,3 +126,21 @@ func ReplaceAliases(s string) string {
loadMap() loadMap()
return aliasReplacer.Replace(s) return aliasReplacer.Replace(s)
} }
// FindEmojiSubmatchIndex returns the byte index pair [start, end) of the first
// emoji found in s, or nil when s contains no emoji from GemojiData.
//
// When several emoji start at the same offset (a combined emoji and the
// single emoji it is built from), the longest one is reported so that combined
// emoji are matched as a whole.
func FindEmojiSubmatchIndex(s string) []int {
	loadMap()

	// if rune and string length are the same then no emoji will be present
	// similar performance when there is unicode present but almost 200% faster when not
	if utf8.RuneCountInString(s) == len(s) {
		return nil
	}

	start, end := -1, -1
	for j := range GemojiData {
		code := GemojiData[j].Emoji
		// An empty emoji would "match" at offset 0 of every string.
		if code == "" {
			continue
		}

		i := strings.Index(s, code)
		if i == -1 {
			continue
		}

		// loadMap sorts GemojiData from longest to shortest emoji, so the
		// strict comparison keeps the longest candidate for a given offset
		// while still preferring whichever emoji appears earliest in s.
		if start == -1 || i < start {
			start, end = i, i+len(code)
			if i == 0 {
				// Nothing can start earlier than the beginning of s.
				break
			}
		}
	}

	if start == -1 {
		return nil
	}
	return []int{start, end}
}

View file

@ -65,10 +65,6 @@ var (
// EmojiShortCodeRegex find emoji by alias like :smile: // EmojiShortCodeRegex find emoji by alias like :smile:
EmojiShortCodeRegex = regexp.MustCompile(`\:[\w\+\-]+\:{1}`) EmojiShortCodeRegex = regexp.MustCompile(`\:[\w\+\-]+\:{1}`)
// find emoji literal: search all emoji hex range as many times as they appear as
// some emojis (skin color etc..) are just two or more chained together
emojiRegex = regexp.MustCompile(`[\x{1F000}-\x{1FFFF}|\x{2000}-\x{32ff}|\x{fe4e5}-\x{fe4ee}|\x{200D}|\x{FE0F}|\x{e0000}-\x{e007f}]+`)
) )
// CSS class for action keywords (e.g. "closes: #1") // CSS class for action keywords (e.g. "closes: #1")
@ -922,8 +918,7 @@ func emojiShortCodeProcessor(ctx *postProcessCtx, node *html.Node) {
// emoji processor to match emoji and add emoji class // emoji processor to match emoji and add emoji class
func emojiProcessor(ctx *postProcessCtx, node *html.Node) { func emojiProcessor(ctx *postProcessCtx, node *html.Node) {
m := emojiRegex.FindStringSubmatchIndex(node.Data) m := emoji.FindEmojiSubmatchIndex(node.Data)
if m == nil { if m == nil {
return return
} }

View file

@ -263,7 +263,9 @@ func TestRender_emoji(t *testing.T) {
test( test(
"Some text with :smile: in the middle", "Some text with :smile: in the middle",
`<p>Some text with <span class="emoji" aria-label="grinning face with smiling eyes">😄</span> in the middle</p>`) `<p>Some text with <span class="emoji" aria-label="grinning face with smiling eyes">😄</span> in the middle</p>`)
test(
"Some text with 😄😄 2 emoji next to each other",
`<p>Some text with <span class="emoji" aria-label="grinning face with smiling eyes">😄</span><span class="emoji" aria-label="grinning face with smiling eyes">😄</span> 2 emoji next to each other</p>`)
// should match nothing // should match nothing
test( test(
"2001:0db8:85a3:0000:0000:8a2e:0370:7334", "2001:0db8:85a3:0000:0000:8a2e:0370:7334",