From ea4c139cd2f7e5174627a40aa8a9973fabf508ff Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Sat, 30 May 2020 10:46:15 +0300 Subject: [PATCH] Change language statistics to save size instead of percentage (#11681) * Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> --- models/migrations/migrations.go | 2 + models/migrations/v140.go | 56 ++++++++++++++ models/repo_language_stats.go | 102 +++++++++++++++++++++----- modules/git/repo_language_stats.go | 25 ++----- modules/indexer/stats/indexer_test.go | 3 + 5 files changed, 150 insertions(+), 38 deletions(-) create mode 100644 models/migrations/v140.go diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 00d84da2e..869661aee 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -212,6 +212,8 @@ var migrations = []Migration{ NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), // v139 -> v140 NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), + // v140 -> v141 + NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize), } // GetCurrentDBVersion returns the current db version diff --git a/models/migrations/v140.go b/models/migrations/v140.go new file mode 100644 index 000000000..871d14b84 --- /dev/null +++ b/models/migrations/v140.go @@ -0,0 +1,56 @@ +// Copyright 2020 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file.
+ +package migrations + +import ( + "fmt" + + "code.gitea.io/gitea/modules/setting" + + "xorm.io/xorm" +) + +func fixLanguageStatsToSaveSize(x *xorm.Engine) error { + // LanguageStat see models/repo_language_stats.go + type LanguageStat struct { + Size int64 `xorm:"NOT NULL DEFAULT 0"` + } + + // RepoIndexerType specifies the repository indexer type + type RepoIndexerType int + + const ( + // RepoIndexerTypeCode code indexer + RepoIndexerTypeCode RepoIndexerType = iota // 0 + // RepoIndexerTypeStats repository stats indexer + RepoIndexerTypeStats // 1 + ) + + // RepoIndexerStatus see models/repo_indexer.go + type RepoIndexerStatus struct { + IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` + } + + if err := x.Sync2(new(LanguageStat)); err != nil { + return fmt.Errorf("Sync2: %v", err) + } + + x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}) + + // Delete language stat statuses + truncExpr := "TRUNCATE TABLE" + if setting.Database.UseSQLite3 { + truncExpr = "DELETE FROM" + } + + // Delete language stats + if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil { + return err + } + + sess := x.NewSession() + defer sess.Close() + return dropTableColumns(sess, "language_stat", "percentage") +} diff --git a/models/repo_language_stats.go b/models/repo_language_stats.go index 5f1aed1f3..d08782eaf 100644 --- a/models/repo_language_stats.go +++ b/models/repo_language_stats.go @@ -20,11 +20,28 @@ type LanguageStat struct { CommitID string IsPrimary bool Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` - Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` + Percentage float32 `xorm:"-"` + Size int64 `xorm:"NOT NULL DEFAULT 0"` Color string `xorm:"-"` CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` } +// specialLanguages defines list of languages that are excluded from the calculation +// unless they are the only language present in repository. 
Only languages which under +// normal circumstances are not considered to be code should be listed here. +var specialLanguages = map[string]struct{}{ + "XML": {}, + "JSON": {}, + "TOML": {}, + "YAML": {}, + "INI": {}, + "SQL": {}, + "SVG": {}, + "Text": {}, + "Markdown": {}, + "other": {}, +} + // LanguageStatList defines a list of language statistics type LanguageStatList []*LanguageStat @@ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() { } } +func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { + langPerc := make(map[string]float32) + var otherPerc float32 = 100 + var total int64 + // Check that repository has at least one non-special language + var skipSpecial bool + for _, stat := range stats { + if _, ok := specialLanguages[stat.Language]; !ok { + skipSpecial = true + break + } + } + for _, stat := range stats { + // Exclude specific languages from percentage calculation + if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { + continue + } + total += stat.Size + } + if total > 0 { + for _, stat := range stats { + // Exclude specific languages from percentage calculation + if _, ok := specialLanguages[stat.Language]; ok && skipSpecial { + continue + } + perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) + if perc <= 0.1 { + continue + } + otherPerc -= perc + langPerc[stat.Language] = perc + } + otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) + } else { + otherPerc = 100 + } + if otherPerc > 0 { + langPerc["other"] = otherPerc + } + return langPerc +} + func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { stats := make(LanguageStatList, 0, 6) - if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { + if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil { return nil, err } - stats.loadAttributes() return stats, nil } @@ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit 
int) (LanguageStatList, error) if err != nil { return nil, err } + perc := stats.getLanguagePercentages() topstats := make(LanguageStatList, 0, limit) var other float32 for i := range stats { - if stats[i].Language == "other" || len(topstats) >= limit { - other += stats[i].Percentage + if _, ok := perc[stats[i].Language]; !ok { continue } + if stats[i].Language == "other" || len(topstats) >= limit { + other += perc[stats[i].Language] + continue + } + stats[i].Percentage = perc[stats[i].Language] topstats = append(topstats, stats[i]) } if other > 0 { @@ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) Percentage: float32(math.Round(float64(other)*10) / 10), }) } + topstats.loadAttributes() return topstats, nil } // UpdateLanguageStats updates the language statistics for repository -func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { +func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error { sess := x.NewSession() if err := sess.Begin(); err != nil { return err @@ -87,15 +151,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl return err } var topLang string - var p float32 - for lang, perc := range stats { - if perc > p { - p = perc + var s int64 + for lang, size := range stats { + if size > s { + s = size topLang = strings.ToLower(lang) } } - for lang, perc := range stats { + for lang, size := range stats { upd := false llang := strings.ToLower(lang) for _, s := range oldstats { @@ -103,8 +167,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl if strings.ToLower(s.Language) == llang { s.CommitID = commitID s.IsPrimary = llang == topLang - s.Percentage = perc - if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { + s.Size = size + if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != 
nil { return err } upd = true @@ -114,11 +178,11 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl // Insert new language if !upd { if _, err := sess.Insert(&LanguageStat{ - RepoID: repo.ID, - CommitID: commitID, - IsPrimary: llang == topLang, - Language: lang, - Percentage: perc, + RepoID: repo.ID, + CommitID: commitID, + IsPrimary: llang == topLang, + Language: lang, + Size: size, }); err != nil { return err } @@ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error { return err } RepoLang := make(LanguageStatList, 0, 6) - if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { + if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil { return err } if len(RepoLang) > 0 { diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index 8ff8fa20c..d623d6f57 100644 --- a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -8,7 +8,6 @@ import ( "bytes" "io" "io/ioutil" - "math" "code.gitea.io/gitea/modules/analyze" @@ -21,7 +20,7 @@ import ( const fileSizeLimit int64 = 16 * 1024 * 1024 // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { +func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) if err != nil { return nil, err @@ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e } sizes := make(map[string]int64) - var total int64 err = tree.Files().ForEach(func(f *object.File) error { if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { @@ -60,11 +58,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e language := 
analyze.GetCodeLanguage(f.Name, content) if language == enry.OtherLanguage || language == "" { - return nil + language = "other" } sizes[language] += f.Size - total += f.Size return nil }) @@ -72,21 +69,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e return nil, err } - stats := make(map[string]float32) - var otherPerc float32 = 100 - for language, size := range sizes { - perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) - if perc <= 0.1 { - continue - } - otherPerc -= perc - stats[language] = perc + if len(sizes) == 0 { + sizes["other"] = 0 } - otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) - if otherPerc > 0 { - stats["other"] = otherPerc - } - return stats, nil + + return sizes, nil } func readFile(f *object.File, limit int64) ([]byte, error) { diff --git a/modules/indexer/stats/indexer_test.go b/modules/indexer/stats/indexer_test.go index 29d0f6dbe..b60c6d9bb 100644 --- a/modules/indexer/stats/indexer_test.go +++ b/modules/indexer/stats/indexer_test.go @@ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) { repo, err := models.GetRepositoryByID(1) assert.NoError(t, err) + status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) + assert.NoError(t, err) + assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) langs, err := repo.GetTopLanguageStats(5) assert.NoError(t, err) assert.Len(t, langs, 1)