Change language statistics to save size instead of percentage (#11681)

* Change language statistics to save size instead of percentage in database

Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com>

* Do not exclude if only language

* Fix edge cases with special languages

Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com>
This commit is contained in:
Lauris BH 2020-05-30 10:46:15 +03:00 committed by GitHub
parent 4395c607ed
commit ea4c139cd2
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 150 additions and 38 deletions

View file

@ -212,6 +212,8 @@ var migrations = []Migration{
NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn),
// v139 -> v140 // v139 -> v140
NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs),
// v140 -> v141
NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize),
} }
// GetCurrentDBVersion returns the current db version // GetCurrentDBVersion returns the current db version

56
models/migrations/v140.go Normal file
View file

@ -0,0 +1,56 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package migrations
import (
"fmt"
"code.gitea.io/gitea/modules/setting"
"xorm.io/xorm"
)
// fixLanguageStatsToSaveSize is the v140 -> v141 migration that switches the
// language_stat table from a stored percentage to a stored size.
//
// Steps, in order:
//  1. sync the table so that the new size column exists,
//  2. delete every repo indexer status row of the stats indexer type,
//  3. remove all rows from language_stat,
//  4. ask dropTableColumns to remove the percentage column.
//
// Any failing step aborts the migration and its error is returned.
func fixLanguageStatsToSaveSize(x *xorm.Engine) error {
	// LanguageStat see models/repo_language_stats.go
	type LanguageStat struct {
		Size int64 `xorm:"NOT NULL DEFAULT 0"`
	}

	// RepoIndexerType specifies the repository indexer type
	type RepoIndexerType int

	const (
		// RepoIndexerTypeCode code indexer
		RepoIndexerTypeCode RepoIndexerType = iota // 0
		// RepoIndexerTypeStats repository stats indexer
		RepoIndexerTypeStats // 1
	)

	// RepoIndexerStatus see models/repo_indexer.go
	type RepoIndexerStatus struct {
		IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"`
	}

	if err := x.Sync2(new(LanguageStat)); err != nil {
		return fmt.Errorf("Sync2: %v", err)
	}

	// Delete language stat statuses. The error must not be ignored: the
	// stats rows are wiped right below, and stale status rows would
	// presumably make the indexer treat those repositories as up to date
	// (NOTE(review): confirm against the stats indexer).
	if _, err := x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}); err != nil {
		return fmt.Errorf("delete stats indexer statuses: %v", err)
	}

	// SQLite has no TRUNCATE TABLE statement; an unqualified DELETE FROM is
	// used there instead.
	truncExpr := "TRUNCATE TABLE"
	if setting.Database.UseSQLite3 {
		truncExpr = "DELETE FROM"
	}

	// Delete language stats
	if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil {
		return err
	}

	sess := x.NewSession()
	defer sess.Close()
	return dropTableColumns(sess, "language_stat", "percentage")
}

View file

@ -20,11 +20,28 @@ type LanguageStat struct {
CommitID string CommitID string
IsPrimary bool IsPrimary bool
Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"`
Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` Percentage float32 `xorm:"-"`
Size int64 `xorm:"NOT NULL DEFAULT 0"`
Color string `xorm:"-"` Color string `xorm:"-"`
CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"`
} }
// specialLanguages is the set of language names that are left out of the
// percentage calculation whenever the repository also contains at least one
// language that is not in this set. If every detected language is special,
// none of them is excluded. Only languages which under normal circumstances
// are not considered to be code should be listed here.
//
// Lookups are plain map lookups on LanguageStat.Language, so the spelling and
// case of each key must match the stored language name exactly. "other" is
// the name GetLanguageStats stores for files whose language is reported as
// enry.OtherLanguage or as an empty string.
var specialLanguages = map[string]struct{}{
"XML": {},
"JSON": {},
"TOML": {},
"YAML": {},
"INI": {},
"SQL": {},
"SVG": {},
"Text": {},
"Markdown": {},
"other": {},
}
// LanguageStatList defines a list of language statistics // LanguageStatList defines a list of language statistics
type LanguageStatList []*LanguageStat type LanguageStatList []*LanguageStat
@ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() {
} }
} }
// getLanguagePercentages converts the stored per-language sizes into
// percentages rounded to one decimal place.
//
// Languages listed in specialLanguages are left out of the calculation when
// the list contains at least one non-special language; if every language is
// special, all of them are counted. A counted language whose rounded share
// is 0.1 or less gets no entry of its own. Whatever part of 100 is not
// attributed to an individual language is reported under the "other" key.
// When the counted sizes sum to zero (including an empty list) the result is
// {"other": 100}.
func (stats LanguageStatList) getLanguagePercentages() map[string]float32 {
	langPerc := make(map[string]float32)
	var otherPerc float32 = 100
	var total int64

	// Check that repository has at least one non-special language
	var skipSpecial bool
	for _, stat := range stats {
		if _, ok := specialLanguages[stat.Language]; !ok {
			skipSpecial = true
			break
		}
	}

	// excluded reports whether a language is kept out of the calculation.
	excluded := func(language string) bool {
		_, special := specialLanguages[language]
		return special && skipSpecial
	}

	for _, stat := range stats {
		if excluded(stat.Language) {
			continue
		}
		total += stat.Size
	}

	if total > 0 {
		for _, stat := range stats {
			if excluded(stat.Language) {
				continue
			}
			perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10)
			if perc <= 0.1 {
				// Too small to list; its share stays in the remainder.
				continue
			}
			otherPerc -= perc
			langPerc[stat.Language] = perc
		}
		otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
	}

	if otherPerc > 0 {
		// Add instead of assign: when only special languages are present,
		// "other" may already hold its own share from the loop above, and
		// overwriting it with the remainder would lose that share.
		langPerc["other"] = float32(math.Round(float64(langPerc["other"]+otherPerc)*10) / 10)
	}

	return langPerc
}
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) {
stats := make(LanguageStatList, 0, 6) stats := make(LanguageStatList, 0, 6)
if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil {
return nil, err return nil, err
} }
stats.loadAttributes()
return stats, nil return stats, nil
} }
@ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
if err != nil { if err != nil {
return nil, err return nil, err
} }
perc := stats.getLanguagePercentages()
topstats := make(LanguageStatList, 0, limit) topstats := make(LanguageStatList, 0, limit)
var other float32 var other float32
for i := range stats { for i := range stats {
if stats[i].Language == "other" || len(topstats) >= limit { if _, ok := perc[stats[i].Language]; !ok {
other += stats[i].Percentage
continue continue
} }
if stats[i].Language == "other" || len(topstats) >= limit {
other += perc[stats[i].Language]
continue
}
stats[i].Percentage = perc[stats[i].Language]
topstats = append(topstats, stats[i]) topstats = append(topstats, stats[i])
} }
if other > 0 { if other > 0 {
@ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
Percentage: float32(math.Round(float64(other)*10) / 10), Percentage: float32(math.Round(float64(other)*10) / 10),
}) })
} }
topstats.loadAttributes()
return topstats, nil return topstats, nil
} }
// UpdateLanguageStats updates the language statistics for repository // UpdateLanguageStats updates the language statistics for repository
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error {
sess := x.NewSession() sess := x.NewSession()
if err := sess.Begin(); err != nil { if err := sess.Begin(); err != nil {
return err return err
@ -87,15 +151,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
return err return err
} }
var topLang string var topLang string
var p float32 var s int64
for lang, perc := range stats { for lang, size := range stats {
if perc > p { if size > s {
p = perc s = size
topLang = strings.ToLower(lang) topLang = strings.ToLower(lang)
} }
} }
for lang, perc := range stats { for lang, size := range stats {
upd := false upd := false
llang := strings.ToLower(lang) llang := strings.ToLower(lang)
for _, s := range oldstats { for _, s := range oldstats {
@ -103,8 +167,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
if strings.ToLower(s.Language) == llang { if strings.ToLower(s.Language) == llang {
s.CommitID = commitID s.CommitID = commitID
s.IsPrimary = llang == topLang s.IsPrimary = llang == topLang
s.Percentage = perc s.Size = size
if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil {
return err return err
} }
upd = true upd = true
@ -114,11 +178,11 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
// Insert new language // Insert new language
if !upd { if !upd {
if _, err := sess.Insert(&LanguageStat{ if _, err := sess.Insert(&LanguageStat{
RepoID: repo.ID, RepoID: repo.ID,
CommitID: commitID, CommitID: commitID,
IsPrimary: llang == topLang, IsPrimary: llang == topLang,
Language: lang, Language: lang,
Percentage: perc, Size: size,
}); err != nil { }); err != nil {
return err return err
} }
@ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error {
return err return err
} }
RepoLang := make(LanguageStatList, 0, 6) RepoLang := make(LanguageStatList, 0, 6)
if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil {
return err return err
} }
if len(RepoLang) > 0 { if len(RepoLang) > 0 {

View file

@ -8,7 +8,6 @@ import (
"bytes" "bytes"
"io" "io"
"io/ioutil" "io/ioutil"
"math"
"code.gitea.io/gitea/modules/analyze" "code.gitea.io/gitea/modules/analyze"
@ -21,7 +20,7 @@ import (
const fileSizeLimit int64 = 16 * 1024 * 1024 const fileSizeLimit int64 = 16 * 1024 * 1024
// GetLanguageStats calculates language stats for git repository at specified commit // GetLanguageStats calculates language stats for git repository at specified commit
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
r, err := git.PlainOpen(repo.Path) r, err := git.PlainOpen(repo.Path)
if err != nil { if err != nil {
return nil, err return nil, err
@ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
} }
sizes := make(map[string]int64) sizes := make(map[string]int64)
var total int64
err = tree.Files().ForEach(func(f *object.File) error { err = tree.Files().ForEach(func(f *object.File) error {
if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
@ -60,11 +58,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
language := analyze.GetCodeLanguage(f.Name, content) language := analyze.GetCodeLanguage(f.Name, content)
if language == enry.OtherLanguage || language == "" { if language == enry.OtherLanguage || language == "" {
return nil language = "other"
} }
sizes[language] += f.Size sizes[language] += f.Size
total += f.Size
return nil return nil
}) })
@ -72,21 +69,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
return nil, err return nil, err
} }
stats := make(map[string]float32) if len(sizes) == 0 {
var otherPerc float32 = 100 sizes["other"] = 0
for language, size := range sizes {
perc := float32(math.Round(float64(size)/float64(total)*1000) / 10)
if perc <= 0.1 {
continue
}
otherPerc -= perc
stats[language] = perc
} }
otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
if otherPerc > 0 { return sizes, nil
stats["other"] = otherPerc
}
return stats, nil
} }
func readFile(f *object.File, limit int64) ([]byte, error) { func readFile(f *object.File, limit int64) ([]byte, error) {

View file

@ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) {
repo, err := models.GetRepositoryByID(1) repo, err := models.GetRepositoryByID(1)
assert.NoError(t, err) assert.NoError(t, err)
status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats)
assert.NoError(t, err)
assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha)
langs, err := repo.GetTopLanguageStats(5) langs, err := repo.GetTopLanguageStats(5)
assert.NoError(t, err) assert.NoError(t, err)
assert.Len(t, langs, 1) assert.Len(t, langs, 1)