From fb81fa7c2982d4084c894290c6b91cf1cdbe611b Mon Sep 17 00:00:00 2001 From: a1012112796 <1012112796@qq.com> Date: Mon, 1 Mar 2021 00:52:01 +0800 Subject: [PATCH 1/4] get language from .gitattributes file rules: linguist-language= attribute to an lang linguist-vendored attribute to vendor or un-vendor path ref: https://stackoverflow.com/questions/40659265/using-gitattributes-for-linguist-examples Signed-off-by: a1012112796 <1012112796@qq.com> --- modules/git/repo_attribute.go | 160 +++++++++++++++++++++ modules/git/repo_attribute_test.go | 80 +++++++++++ modules/git/repo_language_stats_gogit.go | 15 +- modules/git/repo_language_stats_nogogit.go | 22 ++- modules/indexer/stats/db.go | 28 +++- 5 files changed, 296 insertions(+), 9 deletions(-) create mode 100644 modules/git/repo_attribute_test.go diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index aa5e4c10e70d8..ee303826fde9d 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -5,8 +5,13 @@ package git import ( + "bufio" "bytes" "fmt" + "io" + "strings" + + "github.com/gobwas/glob" ) // CheckAttributeOpts represents the possible options to CheckAttribute @@ -80,3 +85,158 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ return name2attribute2info, nil } + +// AttrCheckResultType result type of AttrCheckResult +type AttrCheckResultType int + +const ( + // AttrCheckResultTypeUnspecified the attribute is not defined for the path + AttrCheckResultTypeUnspecified AttrCheckResultType = iota + // AttrCheckResultTypeUnset the attribute is defined as false + AttrCheckResultTypeUnset + // AttrCheckResultTypeSet the attribute is defined as true + AttrCheckResultTypeSet + // AttrCheckResultTypeValue a value has been assigned to the attribute + AttrCheckResultTypeValue +) + +// AttrCheckResult the result of CheckAttributeFile +type AttrCheckResult struct { + typ AttrCheckResultType + data string +} + +// IsSet if the attribute is defined as true +func (r *AttrCheckResult) IsSet() bool { + return r.typ == AttrCheckResultTypeSet +} + +// Value get the value of AttrCheckResult +func (r *AttrCheckResult) Value() string { + if r.typ != AttrCheckResultTypeValue { + return "" + } + return r.data +} + +// AttrChecker Attribute checker +// format attr: partens +type AttrChecker map[string][]*attrCheckerItem + +type attrCheckerItem struct { + pattern glob.Glob + rs *AttrCheckResult +} + +// LoadAttrbutCheckerFromCommit load AttrChecker from a commit +func LoadAttrbutCheckerFromCommit(commit *Commit) (AttrChecker, error) { + gitAttrEntry, err := commit.GetTreeEntryByPath("/.gitattributes") + if err != nil { + if !IsErrNotExist(err) { + return nil, err + } + return nil, nil + } + if gitAttrEntry.IsDir() { + return nil, nil + } + + blob := gitAttrEntry.Blob() + dataRc, err := blob.DataAsync() + if err != nil { + return nil, err + } + defer dataRc.Close() + gitAttr := make([]byte, 1024) + n, _ := dataRc.Read(gitAttr) + gitAttr = gitAttr[:n] + + return LoadAttrbutCheckerFromReader(bytes.NewReader(gitAttr)) +} + +// LoadAttrbutCheckerFromReader load AttrChecker from content reader +func LoadAttrbutCheckerFromReader(r io.Reader) (AttrChecker, error) { + cheker := make(AttrChecker) + + readr := bufio.NewScanner(r) + for readr.Scan() { + t := readr.Text() + // format: pattern attr1 attr2 ... + if len(t) == 0 { + continue + } + + splits := strings.Split(t, " ") + if len(splits) < 2 { + continue + } + + // to let `/AAA/*.txt` can match `AAA/bb.txt`, have to + // remove first / if exit + splits[0] = strings.TrimPrefix(splits[0], "/") + + // get parten + g, err := glob.Compile(splits[0], '/') + if err != nil { + return nil, err + } + + check := func(attr string) (string, *AttrCheckResult) { + // one attr may has three status: + // set: XXX + // unset: -XXX + // value: XXX=VVV + if kv := strings.SplitN(attr, "=", 2); len(kv) == 2 { + return kv[0], &AttrCheckResult{ + typ: AttrCheckResultTypeValue, + data: kv[1], + } + } + typ := AttrCheckResultTypeSet + if strings.HasPrefix(attr, "-") { + attr = attr[1:] + typ = AttrCheckResultTypeUnset + } + return attr, &AttrCheckResult{typ: typ} + } + + // check attrs + attrs := splits[1:] + for _, tmp := range attrs { + attr, rs := check(tmp) + v, ok := cheker[attr] + if !ok { + v = make([]*attrCheckerItem, 0, 5) + } + + v = append(v, &attrCheckerItem{ + pattern: g, + rs: rs, + }) + cheker[attr] = v + } + } + + return cheker, nil +} + +// Check check an git attr +func (c AttrChecker) Check(requestAttr, path string) *AttrCheckResult { + if c == nil { + return nil + } + + v, ok := c[requestAttr] + if !ok { + return &AttrCheckResult{typ: AttrCheckResultTypeUnspecified} + } + + for _, item := range v { + if !item.pattern.Match(path) { + continue + } + return item.rs + } + + return &AttrCheckResult{typ: AttrCheckResultTypeUnspecified} +} diff --git a/modules/git/repo_attribute_test.go b/modules/git/repo_attribute_test.go new file mode 100644 index 0000000000000..d48880e8e4494 --- /dev/null +++ b/modules/git/repo_attribute_test.go @@ -0,0 +1,80 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package git + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestCheckAttributeFile(t *testing.T) { + testContent := `*.txt text eol=lf +/vendor/** -text -eol linguist-vendored +/tools/**/*.py linguist-vendored +` + r := strings.NewReader(testContent) + checker, err := LoadAttrbutCheckerFromReader(r) + if !assert.NoError(t, err) { + return + } + if !assert.NotEmpty(t, checker) { + return + } + + tests := []struct { + want *AttrCheckResult + content string + requestAttr string + path string + }{ + { + want: &AttrCheckResult{ + typ: AttrCheckResultTypeValue, + data: "lf", + }, + path: "aa.txt", + requestAttr: "eol", + }, + { + want: &AttrCheckResult{ + typ: AttrCheckResultTypeUnset, + data: "", + }, + path: "vendor/aa.txt", + requestAttr: "eol", + }, + { + want: &AttrCheckResult{ + typ: AttrCheckResultTypeUnspecified, + data: "", + }, + path: "aa.png", + requestAttr: "text", + }, + { + want: &AttrCheckResult{ + typ: AttrCheckResultTypeSet, + data: "", + }, + path: "vendor/bbb/aa.json", + requestAttr: "linguist-vendored", + }, + // TODO: glob tools/**/*.py should match it, but can't ... + // { + // want: &AttrCheckResult{ + // typ: AttrCheckResultTypeSet, + // data: "", + // }, + // path: "tools/aa.py", + // requestAttr: "linguist-vendored", + // }, + } + for _, tt := range tests { + got := checker.Check(tt.requestAttr, tt.path) + assert.Equal(t, tt.want, got) + } +} diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index b5a235921c8ae..36fd51aee4176 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -20,7 +20,7 @@ import ( ) // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { +func (repo *Repository) GetLanguageStats(commitID string, preCheck func(path string) (string, bool)) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) if err != nil { return nil, err @@ -57,9 +57,18 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err return nil } - // TODO: Use .gitattributes file for linguist overrides + language := "" + skip := false + if preCheck != nil { + language, skip = preCheck(f.Name) + if skip { + return nil + } + } - language := analyze.GetCodeLanguage(f.Name, content) + if len(language) == 0 { + language = analyze.GetCodeLanguage(f.Name, content) + } if language == enry.OtherLanguage || language == "" { return nil } diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index 4c6f07f0fba57..b5320d2c82ecd 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -19,7 +19,7 @@ import ( ) // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { +func (repo *Repository) GetLanguageStats(commitID string, preCheck func(path string) (string, bool)) (map[string]int64, error) { // We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary. // so let's create a batch stdin and stdout @@ -128,10 +128,22 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err continue } - // TODO: Use .gitattributes file for linguist overrides - // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? - // - eg. do the all the detection tests using filename first before reading content. - language := analyze.GetCodeLanguage(f.Name(), content) + // Use .gitattributes file for linguist overrides + language := "" + skip := false + if preCheck != nil { + language, skip = preCheck(f.Name()) + if skip { + continue + } + } + + if len(language) == 0 { + // FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary? + // - eg. do the all the detection tests using filename first before reading content. + language = analyze.GetCodeLanguage(f.Name(), content) + } + if language == enry.OtherLanguage || language == "" { continue } diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go index bc3fbc13d8932..100c89e729d4a 100644 --- a/modules/indexer/stats/db.go +++ b/modules/indexer/stats/db.go @@ -47,8 +47,34 @@ func (db *DBIndexer) Index(id int64) error { return nil } + commit, err := gitRepo.GetCommit(commitID) + if err != nil { + return err + } + + attrChecker, err := git.LoadAttrbutCheckerFromCommit(commit) + if err != nil { + return err + } + // Calculate and save language statistics to database - stats, err := gitRepo.GetLanguageStats(commitID) + stats, err := gitRepo.GetLanguageStats(commitID, func(path string) (string, bool) { + // get language follow linguist rulers + // linguist-language= attribute to an language + // linguist-vendored attribute to vendor or un-vendor paths + + if attrChecker == nil { + return "", false + } + + r := attrChecker.Check("linguist-vendored", path) + if r.IsSet() || r.Value() == "true" { + return "", true + } + + r = attrChecker.Check("linguist-language", path) + return r.Value(), false + }) if err != nil { log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err) return err From a03aa4fbcca2560ef7ce26dc7d38ac4b474e51a8 Mon Sep 17 00:00:00 2001 From: a1012112796 <1012112796@qq.com> Date: Mon, 1 Mar 2021 10:52:30 +0800 Subject: [PATCH 2/4] fix command --- modules/git/repo_attribute.go | 5 +++++ modules/git/repo_attribute_test.go | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index ee303826fde9d..11ccc2e5d8b45 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -166,6 +166,11 @@ func LoadAttrbutCheckerFromReader(r io.Reader) (AttrChecker, error) { continue } + t = strings.TrimLeft(t, " \t\r\n") + if strings.HasPrefix(t, "#") { + continue + } + splits := strings.Split(t, " ") if len(splits) < 2 { continue diff --git a/modules/git/repo_attribute_test.go b/modules/git/repo_attribute_test.go index d48880e8e4494..f8e9385f6ba8b 100644 --- a/modules/git/repo_attribute_test.go +++ b/modules/git/repo_attribute_test.go @@ -13,7 +13,8 @@ import ( func TestCheckAttributeFile(t *testing.T) { testContent := `*.txt text eol=lf -/vendor/** -text -eol linguist-vendored + # test command + /vendor/** -text -eol linguist-vendored /tools/**/*.py linguist-vendored ` r := strings.NewReader(testContent) From d56edc45f8df05b4c159fd51000e3a070964547c Mon Sep 17 00:00:00 2001 From: a1012112796 <1012112796@qq.com> Date: Tue, 2 Mar 2021 17:28:41 +0800 Subject: [PATCH 3/4] use git cmd --- modules/git/repo_attribute.go | 173 ++--------------------------- modules/git/repo_attribute_test.go | 81 -------------- modules/indexer/stats/db.go | 56 ++++++++-- 3 files changed, 51 insertions(+), 259 deletions(-) delete mode 100644 modules/git/repo_attribute_test.go diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index 11ccc2e5d8b45..cd8834613a971 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -5,13 +5,8 @@ package git import ( - "bufio" "bytes" "fmt" - "io" - "strings" - - "github.com/gobwas/glob" ) // CheckAttributeOpts represents the possible options to CheckAttribute @@ -20,6 +15,7 @@ type CheckAttributeOpts struct { AllAttributes bool Attributes []string Filenames []string + IndexFile string } // CheckAttribute return the Blame object of file @@ -59,7 +55,12 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ cmd := NewCommand(cmdArgs...) - if err := cmd.RunInDirPipeline(repo.Path, stdOut, stdErr); err != nil { + env := make([]string, 0, 1) + if len(opts.IndexFile) > 0 { + env = append(env, "GIT_INDEX_FILE="+opts.IndexFile) + } + + if err := cmd.RunInDirTimeoutEnvFullPipeline(env, -1, repo.Path, stdOut, stdErr, nil); err != nil { return nil, fmt.Errorf("Failed to run check-attr: %v\n%s\n%s", err, stdOut.String(), stdErr.String()) } @@ -85,163 +86,3 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ return name2attribute2info, nil } - -// AttrCheckResultType result type of AttrCheckResult -type AttrCheckResultType int - -const ( - // AttrCheckResultTypeUnspecified the attribute is not defined for the path - AttrCheckResultTypeUnspecified AttrCheckResultType = iota - // AttrCheckResultTypeUnset the attribute is defined as false - AttrCheckResultTypeUnset - // AttrCheckResultTypeSet the attribute is defined as true - AttrCheckResultTypeSet - // AttrCheckResultTypeValue a value has been assigned to the attribute - AttrCheckResultTypeValue -) - -// AttrCheckResult the result of CheckAttributeFile -type AttrCheckResult struct { - typ AttrCheckResultType - data string -} - -// IsSet if the attribute is defined as true -func (r *AttrCheckResult) IsSet() bool { - return r.typ == AttrCheckResultTypeSet -} - -// Value get the value of AttrCheckResult -func (r *AttrCheckResult) Value() string { - if r.typ != AttrCheckResultTypeValue { - return "" - } - return r.data -} - -// AttrChecker Attribute checker -// format attr: partens -type AttrChecker map[string][]*attrCheckerItem - -type attrCheckerItem struct { - pattern glob.Glob - rs *AttrCheckResult -} - -// LoadAttrbutCheckerFromCommit load AttrChecker from a commit -func LoadAttrbutCheckerFromCommit(commit *Commit) (AttrChecker, error) { - gitAttrEntry, err := commit.GetTreeEntryByPath("/.gitattributes") - if err != nil { - if !IsErrNotExist(err) { - return nil, err - } - return nil, nil - } - if gitAttrEntry.IsDir() { - return nil, nil - } - - blob := gitAttrEntry.Blob() - dataRc, err := blob.DataAsync() - if err != nil { - return nil, err - } - defer dataRc.Close() - gitAttr := make([]byte, 1024) - n, _ := dataRc.Read(gitAttr) - gitAttr = gitAttr[:n] - - return LoadAttrbutCheckerFromReader(bytes.NewReader(gitAttr)) -} - -// LoadAttrbutCheckerFromReader load AttrChecker from content reader -func LoadAttrbutCheckerFromReader(r io.Reader) (AttrChecker, error) { - cheker := make(AttrChecker) - - readr := bufio.NewScanner(r) - for readr.Scan() { - t := readr.Text() - // format: pattern attr1 attr2 ... - if len(t) == 0 { - continue - } - - t = strings.TrimLeft(t, " \t\r\n") - if strings.HasPrefix(t, "#") { - continue - } - - splits := strings.Split(t, " ") - if len(splits) < 2 { - continue - } - - // to let `/AAA/*.txt` can match `AAA/bb.txt`, have to - // remove first / if exit - splits[0] = strings.TrimPrefix(splits[0], "/") - - // get parten - g, err := glob.Compile(splits[0], '/') - if err != nil { - return nil, err - } - - check := func(attr string) (string, *AttrCheckResult) { - // one attr may has three status: - // set: XXX - // unset: -XXX - // value: XXX=VVV - if kv := strings.SplitN(attr, "=", 2); len(kv) == 2 { - return kv[0], &AttrCheckResult{ - typ: AttrCheckResultTypeValue, - data: kv[1], - } - } - typ := AttrCheckResultTypeSet - if strings.HasPrefix(attr, "-") { - attr = attr[1:] - typ = AttrCheckResultTypeUnset - } - return attr, &AttrCheckResult{typ: typ} - } - - // check attrs - attrs := splits[1:] - for _, tmp := range attrs { - attr, rs := check(tmp) - v, ok := cheker[attr] - if !ok { - v = make([]*attrCheckerItem, 0, 5) - } - - v = append(v, &attrCheckerItem{ - pattern: g, - rs: rs, - }) - cheker[attr] = v - } - } - - return cheker, nil -} - -// Check check an git attr -func (c AttrChecker) Check(requestAttr, path string) *AttrCheckResult { - if c == nil { - return nil - } - - v, ok := c[requestAttr] - if !ok { - return &AttrCheckResult{typ: AttrCheckResultTypeUnspecified} - } - - for _, item := range v { - if !item.pattern.Match(path) { - continue - } - return item.rs - } - - return &AttrCheckResult{typ: AttrCheckResultTypeUnspecified} -} diff --git a/modules/git/repo_attribute_test.go b/modules/git/repo_attribute_test.go deleted file mode 100644 index f8e9385f6ba8b..0000000000000 --- a/modules/git/repo_attribute_test.go +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2021 The Gitea Authors. All rights reserved. -// Use of this source code is governed by a MIT-style -// license that can be found in the LICENSE file. - -package git - -import ( - "strings" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestCheckAttributeFile(t *testing.T) { - testContent := `*.txt text eol=lf - # test command - /vendor/** -text -eol linguist-vendored -/tools/**/*.py linguist-vendored -` - r := strings.NewReader(testContent) - checker, err := LoadAttrbutCheckerFromReader(r) - if !assert.NoError(t, err) { - return - } - if !assert.NotEmpty(t, checker) { - return - } - - tests := []struct { - want *AttrCheckResult - content string - requestAttr string - path string - }{ - { - want: &AttrCheckResult{ - typ: AttrCheckResultTypeValue, - data: "lf", - }, - path: "aa.txt", - requestAttr: "eol", - }, - { - want: &AttrCheckResult{ - typ: AttrCheckResultTypeUnset, - data: "", - }, - path: "vendor/aa.txt", - requestAttr: "eol", - }, - { - want: &AttrCheckResult{ - typ: AttrCheckResultTypeUnspecified, - data: "", - }, - path: "aa.png", - requestAttr: "text", - }, - { - want: &AttrCheckResult{ - typ: AttrCheckResultTypeSet, - data: "", - }, - path: "vendor/bbb/aa.json", - requestAttr: "linguist-vendored", - }, - // TODO: glob tools/**/*.py should match it, but can't ... - // { - // want: &AttrCheckResult{ - // typ: AttrCheckResultTypeSet, - // data: "", - // }, - // path: "tools/aa.py", - // requestAttr: "linguist-vendored", - // }, - } - for _, tt := range tests { - got := checker.Check(tt.requestAttr, tt.path) - assert.Equal(t, tt.want, got) - } -} diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go index 100c89e729d4a..80666d7eaf343 100644 --- a/modules/indexer/stats/db.go +++ b/modules/indexer/stats/db.go @@ -5,9 +5,13 @@ package stats import ( + "io/ioutil" + "os" + "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/util" ) // DBIndexer implements Indexer interface to use database's like search @@ -47,14 +51,24 @@ func (db *DBIndexer) Index(id int64) error { return nil } - commit, err := gitRepo.GetCommit(commitID) - if err != nil { - return err - } + var tmpIndex *os.File + if git.CheckGitVersionAtLeast("1.7.8") == nil { + tmpIndex, err = ioutil.TempFile("", "index") + if err != nil { + return err + } + defer func() { + err := util.Remove(tmpIndex.Name()) + if err != nil { + log.Error("failed to remove tmp index file: %v", err) + } + }() - attrChecker, err := git.LoadAttrbutCheckerFromCommit(commit) - if err != nil { - return err + _, err = git.NewCommand("read-tree", commitID). + RunInDirWithEnv(gitRepo.Path, []string{"GIT_INDEX_FILE=" + tmpIndex.Name()}) + if err != nil { + return err + } } // Calculate and save language statistics to database @@ -63,17 +77,35 @@ func (db *DBIndexer) Index(id int64) error { // linguist-language= attribute to an language // linguist-vendored attribute to vendor or un-vendor paths - if attrChecker == nil { + if tmpIndex == nil { return "", false } - r := attrChecker.Check("linguist-vendored", path) - if r.IsSet() || r.Value() == "true" { + name2attribute2info, err := gitRepo.CheckAttribute(git.CheckAttributeOpts{ + Attributes: []string{"linguist-vendored", "linguist-language"}, + Filenames: []string{path}, + CachedOnly: true, + IndexFile: tmpIndex.Name(), + }) + if err != nil { + log.Error("gitRepo.CheckAttribute: %v", err) + return "", false + } + + attribute2info, has := name2attribute2info[path] + if !has { + return "", false + } + if attribute2info["linguist-vendored"] == "set" { return "", true } - r = attrChecker.Check("linguist-language", path) - return r.Value(), false + lang := attribute2info["linguist-language"] + if lang == "unspecified" { + lang = "" + } + + return lang, false }) if err != nil { log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err) From 29ce8387328c09b085b02e877363b45f163f277b Mon Sep 17 00:00:00 2001 From: a1012112796 <1012112796@qq.com> Date: Fri, 5 Mar 2021 01:12:33 +0800 Subject: [PATCH 4/4] try using pip, not finished --- modules/git/repo_attribute.go | 123 ++++++++++++++++++++++++++++++++++ modules/indexer/stats/db.go | 96 +++++++++++++++++--------- 2 files changed, 188 insertions(+), 31 deletions(-) diff --git a/modules/git/repo_attribute.go b/modules/git/repo_attribute.go index cd8834613a971..40c368215efc1 100644 --- a/modules/git/repo_attribute.go +++ b/modules/git/repo_attribute.go @@ -6,7 +6,11 @@ package git import ( "bytes" + "context" "fmt" + "io" + "strings" + "time" ) // CheckAttributeOpts represents the possible options to CheckAttribute @@ -86,3 +90,122 @@ func (repo *Repository) CheckAttribute(opts CheckAttributeOpts) (map[string]map[ return name2attribute2info, nil } + +// AttrChecker attrs checker +type AttrChecker struct { + // params + RequestAttrs []string + Repo *Repository + IndexFile string + + stdinReader *io.PipeReader + stdinWriter *io.PipeWriter + stdOut *lineWriter + cmd *Command + env []string +} + +// Init init cmd +func (c *AttrChecker) Init() { + if len(c.RequestAttrs) == 0 { + panic("Should have RequestAttrs!") + } + + cmdArgs := []string{"check-attr"} + cmdArgs = append(cmdArgs, c.RequestAttrs...) + if len(c.IndexFile) > 0 { + cmdArgs = append(cmdArgs, "--cached") + c.env = []string{"GIT_INDEX_FILE=" + c.IndexFile} + } + cmdArgs = append(cmdArgs, "--stdin") + c.cmd = NewCommand(cmdArgs...) + c.stdinReader, c.stdinWriter = io.Pipe() + c.stdOut = new(lineWriter) +} + +// Run run cmd +func (c *AttrChecker) Run() error { + stdErr := new(bytes.Buffer) + err := c.cmd.RunInDirTimeoutEnvFullPipeline(c.env, -1, c.Repo.Path, c.stdOut, stdErr, c.stdinReader) + if err != nil { + return fmt.Errorf("failed to run attr-check. Error: %w\nStderr: %s", err, stdErr.String()) + } + + return nil +} + +// CheckAttrs check attr for given path +func (c *AttrChecker) CheckAttrs(path string) (map[string]string, error) { + _, err := c.stdinWriter.Write([]byte(path + "\n")) + if err != nil { + return nil, err + } + + rs := make(map[string]string) + for range c.RequestAttrs { + line, err := c.stdOut.ReadLine(DefaultCommandExecutionTimeout) + if err != nil { + return nil, err + } + splits := strings.SplitN(line, ": ", 3) + if len(splits) != 3 { + continue + } + rs[splits[1]] = splits[2] + } + return rs, nil +} + +// Close close pip after use +func (c *AttrChecker) Close() { + c.stdinWriter.Close() +} + +type lineWriter struct { + tmp []byte + lines chan string +} + +func (wr *lineWriter) Write(p []byte) (n int, err error) { + l := len(p) + if wr.tmp != nil && len(wr.tmp) > 0 { + p = append(wr.tmp, p...) + } + lastEndl := -1 + for i := len(p) - 1; i >= 0; i-- { + if p[i] == '\n' { + lastEndl = i + break + } + } + if lastEndl != len(p)-1 { + wr.tmp = p[lastEndl+1:] + } + + if lastEndl == -1 { + return l, nil + } + + if wr.lines == nil { + wr.lines = make(chan string, 5) + } + + splits := bytes.Split(p[:lastEndl], []byte{'\n'}) + for _, line := range splits { + wr.lines <- string(line) + } + + return l, nil +} + +func (wr *lineWriter) ReadLine(timeOut time.Duration) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeOut) + defer cancel() + + select { + case rs := <-wr.lines: + return rs, nil + case <-ctx.Done(): + return "", ctx.Err() + } +} diff --git a/modules/indexer/stats/db.go b/modules/indexer/stats/db.go index 80666d7eaf343..30f3f721280b6 100644 --- a/modules/indexer/stats/db.go +++ b/modules/indexer/stats/db.go @@ -7,6 +7,7 @@ package stats import ( "io/ioutil" "os" + "sync" "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/git" @@ -69,44 +70,77 @@ func (db *DBIndexer) Index(id int64) error { if err != nil { return err } - } - - // Calculate and save language statistics to database - stats, err := gitRepo.GetLanguageStats(commitID, func(path string) (string, bool) { - // get language follow linguist rulers - // linguist-language= attribute to an language - // linguist-vendored attribute to vendor or un-vendor paths - if tmpIndex == nil { - return "", false + checker := &git.AttrChecker{ + RequestAttrs: []string{"linguist-vendored", "linguist-language"}, + Repo: gitRepo, + IndexFile: tmpIndex.Name(), } - name2attribute2info, err := gitRepo.CheckAttribute(git.CheckAttributeOpts{ - Attributes: []string{"linguist-vendored", "linguist-language"}, - Filenames: []string{path}, - CachedOnly: true, - IndexFile: tmpIndex.Name(), - }) - if err != nil { - log.Error("gitRepo.CheckAttribute: %v", err) - return "", false - } + checker.Init() - attribute2info, has := name2attribute2info[path] - if !has { - return "", false - } - if attribute2info["linguist-vendored"] == "set" { - return "", true - } + wg := new(sync.WaitGroup) + wg.Add(2) + + errCh := make(chan error) - lang := attribute2info["linguist-language"] - if lang == "unspecified" { - lang = "" + // run cmd + go func() { + if err := checker.Run(); err != nil { + errCh <- err + } + wg.Done() + }() + + stats := make(map[string]int64) + + go func() { + var err error + stats, err = gitRepo.GetLanguageStats(commitID, func(path string) (string, bool) { + // get language follow linguist rulers + // linguist-language= attribute to an language + // linguist-vendored attribute to vendor or un-vendor paths + rs, err := checker.CheckAttrs(path) + if err != nil { + log.Error("git.CheckAttrs: %v", err) + return "", false + } + + if rs["linguist-vendored"] == "set" { + return "", true + } + + if lang, has := rs["linguist-language"]; has { + if lang == "unspecified" { + return "", false + } + return lang, false + } + + return "", false + }) + if err != nil { + errCh <- err + } + checker.Close() + wg.Done() + }() + + wg.Wait() + + select { + case err, has := <-errCh: + if has { + log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err) + return err + } + default: + return repo.UpdateLanguageStats(commitID, stats) } + } - return lang, false - }) + // Calculate and save language statistics to database + stats, err := gitRepo.GetLanguageStats(commitID, nil) if err != nil { log.Error("Unable to get language stats for ID %s for defaultbranch %s in %s. Error: %v", commitID, repo.DefaultBranch, repo.RepoPath(), err) return err