diff --git a/codeface/R/analyse_ts.r b/codeface/R/analyse_ts.r index b0ee3cd2..2b2b654a 100755 --- a/codeface/R/analyse_ts.r +++ b/codeface/R/analyse_ts.r @@ -83,6 +83,7 @@ compute.next.timestamp <- function(time, last.time) { ## Take a list of commits and make their date indices unique by ## adding a one second offset to identical ones. make.index.unique <- function(dat, subset) { + if (nrow(dat) == 0) return(dat) dat$commitDate <- ymd_hms(dat$commitDate, quiet=TRUE) last.timestamp <- min(dat$commitDate) - dseconds(1) @@ -111,6 +112,7 @@ gen.full.ts <- function(conf) { } for (i in 1:length(ts)) { + if (is.null(ts[[i]])) next ts[[i]]$ChangedLines <- ts[[i]]$AddedLines + ts[[i]]$DeletedLines full.series[[i]] <- na.omit(xts(ts[[i]]$ChangedLines, order.by=ts[[i]]$commitDate)) @@ -140,8 +142,6 @@ gen.rev.list <- function(revisions) { ## data point. Using the robust median instead of mean considerably ## reduces the amount of outliers process.ts <- function(series) { - duration <- end(series) - start(series) - ## We compute the window lengths based on natural time units ## to avoid dependencies on the lifetime of the project, or on the ## project's relative activity @@ -521,6 +521,11 @@ do.ts.analysis <- function(resdir, graphdir, conf) { ## Prepare the raw time series as input to the smoothing ## algorithms full.ts <- gen.full.ts(conf) + if (is.null(full.ts) || length(full.ts) == 0) { + logwarn("No commit data found for any release range; skipping time series analysis", + logger="analyse_ts") + return(invisible(NULL)) + } series.merged <- process.ts(full.ts) ## Prepare y ranges for the different graph types diff --git a/codeface/R/ts_utils.r b/codeface/R/ts_utils.r index 1c4c7800..e93f4164 100644 --- a/codeface/R/ts_utils.r +++ b/codeface/R/ts_utils.r @@ -20,7 +20,7 @@ suppressPackageStartupMessages(library(dtw)) ## Omit time series elements that exceed the given range trim.series <- function(series, start, end) { series <- series[which(index(series) < end),] - series <- series[which(index(series) > start),] + series <- series[which(index(series) >= start),] return(series) } diff --git a/codeface/VCS.py b/codeface/VCS.py index bcb4d840..58e781ac 100644 --- a/codeface/VCS.py +++ b/codeface/VCS.py @@ -125,6 +125,9 @@ def __init__(self): #file names to include in analysis(non-taged based) self._fileNames = None + #when True, skip the source-file extension filter in addFiles4Analysis + self._all_files = False + self.subsys_description = {} def getCommitDict(self): @@ -145,6 +148,9 @@ def getFileCommitDict(self): def setFileNames(self, fileNames): self._fileNames = fileNames + def setAllFiles(self, value): + self._all_files = value + def getFileNames(self): return self._fileNames @@ -1224,13 +1230,17 @@ def _prepareFileCommitList(self, fnameList, link_type, singleBlame=True, #revision range rev = self.rev_end - # Check if file has been deleted - cmd = "git --git-dir={0} ls-tree".format(self.repo).split() - cmd.append("--name-only") + # Check if file has been deleted or is a submodule (gitlink) + cmd = "git -c core.quotepath=false --git-dir={0} ls-tree".format(self.repo).split() cmd.append("--full-tree") cmd.append("-r") cmd.append(rev) - existing_files = execute_command(cmd).split() + ls_tree_output = execute_command(cmd).splitlines() + # ls-tree output format: " \t" + # Exclude gitlinks (mode 160000) which represent git submodules; + # git blame cannot be run on a submodule path. + existing_files = [line.split('\t', 1)[1] for line in ls_tree_output + if line and not line.startswith('160000')] if file_commit.filename in existing_files: # retrieve blame data if singleBlame: #only one set of blame data per file @@ -1510,6 +1520,16 @@ def _getFunctionLines(self, file_layout_src, file_commit): func_lines = self._parseSrcFileCtags(srcFile.name) file_commit.artefact_line_range = False + if not func_lines: + # No functions detected by either Doxygen or ctags (e.g., for + # Markdown or other non-code files). Fall back to a single + # synthetic file-level artefact so that commits touching such + # files still populate commit_dependency instead of being dropped + # from the dependency-based analysis pipeline entirely. + # FILE_LEVEL is an established special entityId in this codebase. + func_lines = {0: "FILE_LEVEL"} + file_commit.artefact_line_range = True + # clean up src temp file srcFile.close() @@ -1562,30 +1582,33 @@ def addFiles4Analysis(self, cmt_id_list): -- Input -- directories - a list of paths to limit the search for filenames ''' - cmd_base = 'git --git-dir={0} diff-tree'.format(self.repo).split() + cmd_base = 'git -c core.quotepath=false --git-dir={0} diff-tree'.format(self.repo).split() cmd_base.append("--diff-filter=ACMRTB") cmd_base.append("--no-commit-id") cmd_base.append("--name-only") cmd_base.append("-r") #get all files touched by all commits - all_files = set() + touched_files = set() for cmt_id in cmt_id_list: cmd = cmd_base + [cmt_id] cmt_files = execute_command(cmd).splitlines() - all_files.update(cmt_files) - - #filter results to only get implementation files - fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm", - ".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb", - '.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb", - '.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex", - ".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs", - ".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature", - ".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm") - - fileNames = [fileName for fileName in all_files if - fileName.lower().endswith(fileExt)] + touched_files.update(cmt_files) + + if self._all_files: + fileNames = list(touched_files) + else: + #filter results to only get implementation files + fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm", + ".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb", + '.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb", + '.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex", + ".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs", + ".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature", + ".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm") + + fileNames = [fileName for fileName in touched_files if + fileName.lower().endswith(fileExt)] self.setFileNames(fileNames) diff --git a/codeface/cli.py b/codeface/cli.py index 9f5fef16..cc61129b 100644 --- a/codeface/cli.py +++ b/codeface/cli.py @@ -79,6 +79,11 @@ def get_parser(): help="Re-use an already existing vcs-analysis.db file. " "This flag is useful to continue a previously failed analysis" " or for debugging purposes.") + run_parser.add_argument( + '--all-files', action='store_true', dest="all_files", + help="Include all files touched by commits in the analysis, " + "bypassing the built-in source-file extension filter. " + "By default only known source-code extensions are included.") ml_parser = sub_parser.add_parser('ml', help='Run mailing list analysis') ml_parser.set_defaults(func=cmd_ml) @@ -126,7 +131,8 @@ def cmd_run(args): logfile = os.path.abspath(logfile) project_analyse(resdir, gitdir, codeface_conf, project_conf, args.no_report, args.loglevel, logfile, args.recreate, - args.profile_r, args.jobs, args.tagging, args.reuse_db) + args.profile_r, args.jobs, args.tagging, args.reuse_db, + args.all_files) return 0 def cmd_ml(args): diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py index bf7aaf32..ca1268dc 100755 --- a/codeface/cluster/cluster.py +++ b/codeface/cluster/cluster.py @@ -48,7 +48,7 @@ def createDB(filename, git_repo, revrange, subsys_descr, link_type, - range_by_date, rcranges=None): + range_by_date, rcranges=None, all_files=False): #------------------ #configuration #------------------ @@ -57,6 +57,7 @@ def createDB(filename, git_repo, revrange, subsys_descr, link_type, git.setRevisionRange(revrange[0], revrange[1]) git.setSubsysDescription(subsys_descr) git.setRangeByDate(range_by_date) + git.setAllFiles(all_files) if rcranges != None: git.setRCRanges(rcranges) @@ -1854,7 +1855,8 @@ def performAnalysis(conf, dbm, dbfilename, git_repo, revrange, subsys_descr, log.devinfo("Creating data base for {0}..{1}".format(revrange[0], revrange[1])) createDB(dbfilename, git_repo, revrange, subsys_descr, \ - link_type, range_by_date, rcranges) + link_type, range_by_date, rcranges, + all_files=conf.get("all_files", False)) else: log.warning("REUSING data base for {0}..{1} " "(make sure it is up to date)" diff --git a/codeface/project.py b/codeface/project.py index 1dd9382d..920825a9 100644 --- a/codeface/project.py +++ b/codeface/project.py @@ -54,7 +54,7 @@ def project_setup(conf, recreate): def project_analyse(resdir, gitdir, codeface_conf, project_conf, no_report, loglevel, logfile, recreate, profile_r, - n_jobs, tagging_type, reuse_db): + n_jobs, tagging_type, reuse_db, all_files=False): pool = BatchJobPool(int(n_jobs)) conf = Configuration.load(codeface_conf, project_conf) tagging = conf["tagging"] @@ -74,6 +74,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf, repo = pathjoin(gitdir, conf["repo"], ".git") project_resdir = pathjoin(resdir, project, tagging) range_by_date = False + conf["all_files"] = all_files # When revisions are not provided by the configuration file # generate the analysis window automatically diff --git a/codeface/util.py b/codeface/util.py index 43337eef..eac54b58 100644 --- a/codeface/util.py +++ b/codeface/util.py @@ -252,7 +252,13 @@ def execute_command(cmd, ignore_errors=False, direct_io=False, cwd=None, silent_ If direct_io is True, do not capture the stdin and stdout of the command. Returns the stdout of the command. ''' - jcmd = " ".join(cmd) + # In Python 2, cmd may contain a mix of unicode (e.g. repo path from + # PyYAML config) and bytes (e.g. filenames from git output). A plain + # " ".join() would then try to decode bytes with non-ASCII content + # (like emoji filenames) as ASCII and raise UnicodeDecodeError. + # Encode any unicode elements to UTF-8 bytes so the join stays in bytes. + jcmd = b" ".join(s.encode('utf-8') if isinstance(s, unicode) else s + for s in cmd) log.debug("Running command: {}".format(jcmd)) try: if direct_io: @@ -526,8 +532,10 @@ def get_before_arg(num_months): end = start start = end + window_size_months - # Check if any commits occurred since the last analysis window - if rev_start[0] != revs[0]: + # Check if any commits occurred since the last analysis window. + # revs may be empty if no commit was found by the initial --before query + # (e.g. single-commit repo); treat that the same as a new entry. + if len(revs) == 0 or rev_start[0] != revs[0]: revs = rev_start + revs # else: no commit happened since last window, don't add duplicate # revisions @@ -537,16 +545,21 @@ def get_before_arg(num_months): # first commit does not carry the earliest commit date revs = [rev.split(",") for rev in revs] rev_len = len(revs) - if int(revs[0][1]) > int(revs[1][1]): + if len(revs) >= 2 and int(revs[0][1]) > int(revs[1][1]): del revs[0] - # Extract hash values and dates intro seperate lists + if len(revs) < 2: + log.critical("The repository contains only a single commit. " + "At least two commits are required for analysis.") + sys.exit(1) + + # Extract hash values and dates into separate lists revs_hash = [rev[0] for rev in revs] revs_date = [rev[2].split(" ")[0] for rev in revs] - # We cannot detect release canndidate tags in this analysis mode, + # We cannot detect release candidate tags in this analysis mode, # so provide a list with None entries - rcs = [None for x in range(len(revs))] + rcs = [None for x in range(len(revs_hash))] return revs_hash, rcs, revs_date