From 09c667eb45e0327607674c93e08e769b3349c7ce Mon Sep 17 00:00:00 2001 From: Jason Song Date: Tue, 3 Jan 2023 22:03:56 +0800 Subject: [PATCH] Fix sitemap (#22272) (#22320) Backport #22272. Fix #22270. Related to #18407. The old code treated both sitemap and sitemap index as the format like: ```xml ... http://localhost:3000/explore/users/sitemap-1.xml ... ``` Actually, it's incorrect for sitemap index, it should be: ```xml ... http://localhost:3000/explore/users/sitemap-1.xml ... ``` See https://www.sitemaps.org/protocol.html Co-authored-by: Lauris BH Co-authored-by: delvh Co-authored-by: Lunny Xiao --- modules/sitemap/sitemap.go | 44 +++++--- modules/sitemap/sitemap_test.go | 194 +++++++++++++++++++++++--------- 2 files changed, 171 insertions(+), 67 deletions(-) diff --git a/modules/sitemap/sitemap.go b/modules/sitemap/sitemap.go index 14953765a..4654c5fa3 100644 --- a/modules/sitemap/sitemap.go +++ b/modules/sitemap/sitemap.go @@ -12,48 +12,62 @@ import ( "time" ) -// sitemapFileLimit contains the maximum size of a sitemap file -const sitemapFileLimit = 50 * 1024 * 1024 +const ( + sitemapFileLimit = 50 * 1024 * 1024 // the maximum size of a sitemap file + urlsLimit = 50000 -// Url represents a single sitemap entry + schemaURL = "http://www.sitemaps.org/schemas/sitemap/0.9" + urlsetName = "urlset" + sitemapindexName = "sitemapindex" +) + +// URL represents a single sitemap entry type URL struct { URL string `xml:"loc"` LastMod *time.Time `xml:"lastmod,omitempty"` } -// SitemapUrl represents a sitemap +// Sitemap represents a sitemap type Sitemap struct { XMLName xml.Name Namespace string `xml:"xmlns,attr"` - URLs []URL `xml:"url"` + URLs []URL `xml:"url"` + Sitemaps []URL `xml:"sitemap"` } // NewSitemap creates a sitemap func NewSitemap() *Sitemap { return &Sitemap{ - XMLName: xml.Name{Local: "urlset"}, - Namespace: "http://www.sitemaps.org/schemas/sitemap/0.9", + XMLName: xml.Name{Local: urlsetName}, + Namespace: schemaURL, } } -// NewSitemap creates a sitemap index. +// NewSitemapIndex creates a sitemap index. func NewSitemapIndex() *Sitemap { return &Sitemap{ - XMLName: xml.Name{Local: "sitemapindex"}, - Namespace: "http://www.sitemaps.org/schemas/sitemap/0.9", + XMLName: xml.Name{Local: sitemapindexName}, + Namespace: schemaURL, } } // Add adds a URL to the sitemap func (s *Sitemap) Add(u URL) { - s.URLs = append(s.URLs, u) + if s.XMLName.Local == sitemapindexName { + s.Sitemaps = append(s.Sitemaps, u) + } else { + s.URLs = append(s.URLs, u) + } } -// Write writes the sitemap to a response +// WriteTo writes the sitemap to a response func (s *Sitemap) WriteTo(w io.Writer) (int64, error) { - if len(s.URLs) > 50000 { - return 0, fmt.Errorf("The sitemap contains too many URLs: %d", len(s.URLs)) + if l := len(s.URLs); l > urlsLimit { + return 0, fmt.Errorf("The sitemap contains %d URLs, but only %d are allowed", l, urlsLimit) + } + if l := len(s.Sitemaps); l > urlsLimit { + return 0, fmt.Errorf("The sitemap contains %d sub-sitemaps, but only %d are allowed", l, urlsLimit) } buf := bytes.NewBufferString(xml.Header) if err := xml.NewEncoder(buf).Encode(s); err != nil { @@ -63,7 +77,7 @@ func (s *Sitemap) WriteTo(w io.Writer) (int64, error) { return 0, err } if buf.Len() > sitemapFileLimit { - return 0, fmt.Errorf("The sitemap is too big: %d", buf.Len()) + return 0, fmt.Errorf("The sitemap has %d bytes, but only %d are allowed", buf.Len(), sitemapFileLimit) } return buf.WriteTo(w) } diff --git a/modules/sitemap/sitemap_test.go b/modules/sitemap/sitemap_test.go index 63007b847..c7c9cbcd9 100644 --- a/modules/sitemap/sitemap_test.go +++ b/modules/sitemap/sitemap_test.go @@ -7,7 +7,6 @@ package sitemap import ( "bytes" "encoding/xml" - "fmt" "strings" "testing" "time" @@ -15,63 +14,154 @@ import ( "github.com/stretchr/testify/assert" ) -func TestOk(t *testing.T) { - testReal := func(s *Sitemap, name string, urls []URL, expected string) { - for _, url := range urls { - s.Add(url) - } - buf := &bytes.Buffer{} - _, err := s.WriteTo(buf) - assert.NoError(t, nil, err) - assert.Equal(t, xml.Header+"<"+name+" xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"+expected+"\n", buf.String()) - } - test := func(urls []URL, expected string) { - testReal(NewSitemap(), "urlset", urls, expected) - testReal(NewSitemapIndex(), "sitemapindex", urls, expected) - } - +func TestNewSitemap(t *testing.T) { ts := time.Unix(1651322008, 0).UTC() - test( - []URL{}, - "", - ) - test( - []URL{ - {URL: "https://gitea.io/test1", LastMod: &ts}, + tests := []struct { + name string + urls []URL + want string + wantErr string + }{ + { + name: "empty", + urls: []URL{}, + want: xml.Header + `` + + "" + + "\n", }, - "https://gitea.io/test12022-04-30T12:33:28Z", - ) - test( - []URL{ - {URL: "https://gitea.io/test2", LastMod: nil}, + { + name: "regular", + urls: []URL{ + {URL: "https://gitea.io/test1", LastMod: &ts}, + }, + want: xml.Header + `` + + "https://gitea.io/test12022-04-30T12:33:28Z" + + "\n", }, - "https://gitea.io/test2", - ) - test( - []URL{ - {URL: "https://gitea.io/test1", LastMod: &ts}, - {URL: "https://gitea.io/test2", LastMod: nil}, + { + name: "without lastmod", + urls: []URL{ + {URL: "https://gitea.io/test1"}, + }, + want: xml.Header + `` + + "https://gitea.io/test1" + + "\n", + }, + { + name: "multiple", + urls: []URL{ + {URL: "https://gitea.io/test1", LastMod: &ts}, + {URL: "https://gitea.io/test2", LastMod: nil}, + }, + want: xml.Header + `` + + "https://gitea.io/test12022-04-30T12:33:28Z" + + "https://gitea.io/test2" + + "\n", + }, + { + name: "too many urls", + urls: make([]URL, 50001), + wantErr: "The sitemap contains 50001 URLs, but only 50000 are allowed", + }, + { + name: "too big file", + urls: []URL{ + {URL: strings.Repeat("b", 50*1024*1024+1)}, + }, + wantErr: "The sitemap has 52428932 bytes, but only 52428800 are allowed", }, - "https://gitea.io/test12022-04-30T12:33:28Z"+ - "https://gitea.io/test2", - ) -} - -func TestTooManyURLs(t *testing.T) { - s := NewSitemap() - for i := 0; i < 50001; i++ { - s.Add(URL{URL: fmt.Sprintf("https://gitea.io/test%d", i)}) } - buf := &bytes.Buffer{} - _, err := s.WriteTo(buf) - assert.EqualError(t, err, "The sitemap contains too many URLs: 50001") + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := NewSitemap() + for _, url := range tt.urls { + s.Add(url) + } + buf := &bytes.Buffer{} + _, err := s.WriteTo(buf) + if tt.wantErr != "" { + assert.EqualError(t, err, tt.wantErr) + } else { + assert.NoError(t, err) + assert.Equalf(t, tt.want, buf.String(), "NewSitemap()") + } + }) + } } -func TestSitemapTooBig(t *testing.T) { - s := NewSitemap() - s.Add(URL{URL: strings.Repeat("b", sitemapFileLimit)}) - buf := &bytes.Buffer{} - _, err := s.WriteTo(buf) - assert.EqualError(t, err, "The sitemap is too big: 52428931") +func TestNewSitemapIndex(t *testing.T) { + ts := time.Unix(1651322008, 0).UTC() + + tests := []struct { + name string + urls []URL + want string + wantErr string + }{ + { + name: "empty", + urls: []URL{}, + want: xml.Header + `` + + "" + + "\n", + }, + { + name: "regular", + urls: []URL{ + {URL: "https://gitea.io/test1", LastMod: &ts}, + }, + want: xml.Header + `` + + "https://gitea.io/test12022-04-30T12:33:28Z" + + "\n", + }, + { + name: "without lastmod", + urls: []URL{ + {URL: "https://gitea.io/test1"}, + }, + want: xml.Header + `` + + "https://gitea.io/test1" + + "\n", + }, + { + name: "multiple", + urls: []URL{ + {URL: "https://gitea.io/test1", LastMod: &ts}, + {URL: "https://gitea.io/test2", LastMod: nil}, + }, + want: xml.Header + `` + + "https://gitea.io/test12022-04-30T12:33:28Z" + + "https://gitea.io/test2" + + "\n", + }, + { + name: "too many sitemaps", + urls: make([]URL, 50001), + wantErr: "The sitemap contains 50001 sub-sitemaps, but only 50000 are allowed", + }, + { + name: "too big file", + urls: []URL{ + {URL: strings.Repeat("b", 50*1024*1024+1)}, + }, + wantErr: "The sitemap has 52428952 bytes, but only 52428800 are allowed", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + s := NewSitemapIndex() + for _, url := range tt.urls { + s.Add(url) + } + buf := &bytes.Buffer{} + _, err := s.WriteTo(buf) + if tt.wantErr != "" { + assert.EqualError(t, err, tt.wantErr) + } else { + assert.NoError(t, err) + assert.Equalf(t, tt.want, buf.String(), "NewSitemapIndex()") + } + }) + } }