se-sic · Copilot · Apr 28, 2026 · Apr 28, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/codeface/R/analyse_ts.r b/codeface/R/analyse_ts.r
@@ -83,6 +83,7 @@ compute.next.timestamp <- function(time, last.time) {
 ## Take a list of commits and make their date indices unique by
 ## adding a one second offset to identical ones.
 make.index.unique <- function(dat, subset) {
+  if (nrow(dat) == 0) return(dat)
   dat$commitDate <- ymd_hms(dat$commitDate, quiet=TRUE)
   last.timestamp <- min(dat$commitDate) - dseconds(1)
 
@@ -111,6 +112,7 @@ gen.full.ts <- function(conf) {
   }
 
   for (i in 1:length(ts)) {
+    if (is.null(ts[[i]])) next
     ts[[i]]$ChangedLines <- ts[[i]]$AddedLines + ts[[i]]$DeletedLines
     full.series[[i]] <- na.omit(xts(ts[[i]]$ChangedLines,
                                     order.by=ts[[i]]$commitDate))
@@ -140,8 +142,6 @@ gen.rev.list <- function(revisions) {
 ## data point. Using the robust median instead of mean considerably
 ## reduces the amount of outliers
 process.ts <- function(series) {
-  duration <- end(series) - start(series)
-
   ## We compute the window lengths based on natural time units
   ## to avoid dependencies on the lifetime of the project, or on the
   ## project's relative activity
@@ -521,6 +521,11 @@ do.ts.analysis <- function(resdir, graphdir, conf) {
   ## Prepare the raw time series as input to the smoothing
   ## algorithms
   full.ts <- gen.full.ts(conf)
+  if (is.null(full.ts) || length(full.ts) == 0) {
+    logwarn("No commit data found for any release range; skipping time series analysis",
+            logger="analyse_ts")
+    return(invisible(NULL))
+  }
   series.merged <- process.ts(full.ts)
 
   ## Prepare y ranges for the different graph types

diff --git a/codeface/R/ts_utils.r b/codeface/R/ts_utils.r
@@ -20,7 +20,7 @@ suppressPackageStartupMessages(library(dtw))
 ## Omit time series elements that exceed the given range
 trim.series <- function(series, start, end) {
   series <- series[which(index(series) < end),]
-  series <- series[which(index(series) > start),]
+  series <- series[which(index(series) >= start),]  ## start is inclusive, end stays exclusive
 
   return(series)
 }

diff --git a/codeface/VCS.py b/codeface/VCS.py
@@ -125,6 +125,9 @@ def __init__(self):
         #file names to include in analysis(non-taged based)
         self._fileNames = None
 
+        #when True, skip the source-file extension filter in addFiles4Analysis
+        self._all_files = False
+
         self.subsys_description = {}
 
     def getCommitDict(self):
@@ -145,6 +148,9 @@ def getFileCommitDict(self):
     def setFileNames(self, fileNames):
         self._fileNames = fileNames
 
+    def setAllFiles(self, value):
+        self._all_files = value  # truthy: addFiles4Analysis keeps every touched file
+
     def getFileNames(self):
         return self._fileNames
 
@@ -1224,13 +1230,17 @@ def _prepareFileCommitList(self, fnameList, link_type, singleBlame=True,
                 #revision range
                 rev = self.rev_end
 
-            # Check if file has been deleted
-            cmd = "git --git-dir={0} ls-tree".format(self.repo).split()
-            cmd.append("--name-only")
+            # Check if file has been deleted or is a submodule (gitlink)
+            cmd = "git -c core.quotepath=false --git-dir={0} ls-tree".format(self.repo).split()
             cmd.append("--full-tree")
             cmd.append("-r")
             cmd.append(rev)
-            existing_files = execute_command(cmd).split()
+            ls_tree_output = execute_command(cmd).splitlines()
+            # ls-tree output format: "<mode> <type> <hash>\t<filename>"
+            # Exclude gitlinks (mode 160000) which represent git submodules;
+            # git blame cannot be run on a submodule path.
+            existing_files = [line.split('\t', 1)[1] for line in ls_tree_output
+                              if line and not line.startswith('160000')]
             if file_commit.filename in existing_files:
                 # retrieve blame data
                 if singleBlame: #only one set of blame data per file
@@ -1510,6 +1520,16 @@ def _getFunctionLines(self, file_layout_src, file_commit):
             func_lines = self._parseSrcFileCtags(srcFile.name)
             file_commit.artefact_line_range = False
 
+        if not func_lines:
+            # No functions detected by either Doxygen or ctags (e.g., for
+            # Markdown or other non-code files). Fall back to a single
+            # synthetic file-level artefact so that commits touching such
+            # files still populate commit_dependency instead of being dropped
+            # from the dependency-based analysis pipeline entirely.
+            # FILE_LEVEL is an established special entityId in this codebase.
+            func_lines = {0: "FILE_LEVEL"}
+            file_commit.artefact_line_range = True
+
         # clean up src temp file
         srcFile.close()
 
@@ -1562,30 +1582,33 @@ def addFiles4Analysis(self, cmt_id_list):
         -- Input --
         directories - a list of paths to limit the search for filenames
         '''
-        cmd_base = 'git --git-dir={0} diff-tree'.format(self.repo).split()
+        cmd_base = 'git -c core.quotepath=false --git-dir={0} diff-tree'.format(self.repo).split()
         cmd_base.append("--diff-filter=ACMRTB")
         cmd_base.append("--no-commit-id")
         cmd_base.append("--name-only")
         cmd_base.append("-r")
 
         #get all files touched by all commits
-        all_files = set()
+        touched_files = set()
         for cmt_id in cmt_id_list:
             cmd = cmd_base + [cmt_id]
             cmt_files = execute_command(cmd).splitlines()
-            all_files.update(cmt_files)
-
-        #filter results to only get implementation files
-        fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm",
-                   ".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb",
-                   '.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb",
-                   '.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex",
-                   ".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs",
-                   ".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature",
-                   ".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm")
-
-        fileNames = [fileName for fileName in all_files if
-                     fileName.lower().endswith(fileExt)]
+            touched_files.update(cmt_files)
+
+        if self._all_files:
+            fileNames = list(touched_files)
+        else:
+            #filter results to only get implementation files
+            fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm",
+                       ".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb",
+                       '.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb",
+                       '.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex",
+                       ".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs",
+                       ".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature",
+                       ".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm")
+
+            fileNames = [fileName for fileName in touched_files if
+                         fileName.lower().endswith(fileExt)]
 
         self.setFileNames(fileNames)
 

diff --git a/codeface/cli.py b/codeface/cli.py
@@ -79,6 +79,11 @@ def get_parser():
         help="Re-use an already existing vcs-analysis.db file. "
              "This flag is useful to continue a previously failed analysis"
              " or for debugging purposes.")
+    run_parser.add_argument(
+        '--all-files', action='store_true', dest="all_files",
+        help="Include all files touched by commits in the analysis, "
+             "bypassing the built-in source-file extension filter. "
+             "By default only known source-code extensions are included.")
 
     ml_parser = sub_parser.add_parser('ml', help='Run mailing list analysis')
     ml_parser.set_defaults(func=cmd_ml)
@@ -126,7 +131,8 @@ def cmd_run(args):
         logfile = os.path.abspath(logfile)
     project_analyse(resdir, gitdir, codeface_conf, project_conf,
                     args.no_report, args.loglevel, logfile, args.recreate,
-                    args.profile_r, args.jobs, args.tagging, args.reuse_db)
+                    args.profile_r, args.jobs, args.tagging, args.reuse_db,
+                    args.all_files)
     return 0
 
 def cmd_ml(args):

diff --git a/codeface/cluster/cluster.py b/codeface/cluster/cluster.py
@@ -48,7 +48,7 @@
 
 
 def createDB(filename, git_repo, revrange, subsys_descr, link_type,
-             range_by_date, rcranges=None):
+             range_by_date, rcranges=None, all_files=False):
     #------------------
     #configuration
     #------------------
@@ -57,6 +57,7 @@ def createDB(filename, git_repo, revrange, subsys_descr, link_type,
     git.setRevisionRange(revrange[0], revrange[1])
     git.setSubsysDescription(subsys_descr)
     git.setRangeByDate(range_by_date)
+    git.setAllFiles(all_files)
 
     if rcranges != None:
         git.setRCRanges(rcranges)
@@ -1854,7 +1855,8 @@ def performAnalysis(conf, dbm, dbfilename, git_repo, revrange, subsys_descr,
         log.devinfo("Creating data base for {0}..{1}".format(revrange[0],
                                                         revrange[1]))
         createDB(dbfilename, git_repo, revrange, subsys_descr, \
-                 link_type, range_by_date, rcranges)
+                 link_type, range_by_date, rcranges,
+                 all_files=conf.get("all_files", False))
     else:
         log.warning("REUSING data base for {0}..{1} "
                     "(make sure it is up to date)"

diff --git a/codeface/project.py b/codeface/project.py
@@ -54,7 +54,7 @@ def project_setup(conf, recreate):
 
 def project_analyse(resdir, gitdir, codeface_conf, project_conf,
                     no_report, loglevel, logfile, recreate, profile_r,
-                    n_jobs, tagging_type, reuse_db):
+                    n_jobs, tagging_type, reuse_db, all_files=False):
     pool = BatchJobPool(int(n_jobs))
     conf = Configuration.load(codeface_conf, project_conf)
     tagging = conf["tagging"]
@@ -74,6 +74,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf,
     repo = pathjoin(gitdir, conf["repo"], ".git")
     project_resdir = pathjoin(resdir, project, tagging)
     range_by_date = False
+    conf["all_files"] = all_files
 
     # When revisions are not provided by the configuration file
     # generate the analysis window automatically

diff --git a/codeface/util.py b/codeface/util.py
@@ -252,7 +252,13 @@ def execute_command(cmd, ignore_errors=False, direct_io=False, cwd=None, silent_
     If direct_io is True, do not capture the stdin and stdout of the command.
     Returns the stdout of the command.
     '''
-    jcmd = " ".join(cmd)
+    # In Python 2, cmd may contain a mix of unicode (e.g. repo path from
+    # PyYAML config) and bytes (e.g. filenames from git output).  A plain
+    # " ".join() would then try to decode bytes with non-ASCII content
+    # (like emoji filenames) as ASCII and raise UnicodeDecodeError.
+    # Encode any unicode elements to UTF-8 bytes so the join stays in bytes.
+    jcmd = b" ".join(s.encode('utf-8') if isinstance(s, unicode) else s
+                     for s in cmd)
     log.debug("Running command: {}".format(jcmd))
     try:
         if direct_io:
@@ -526,8 +532,10 @@ def get_before_arg(num_months):
             end = start
             start = end + window_size_months
 
-        # Check if any commits occurred since the last analysis window
-        if rev_start[0] != revs[0]:
+        # Check if any commits occurred since the last analysis window.
+        # revs may be empty if no commit was found by the initial --before query
+        # (e.g. single-commit repo); treat that the same as a new entry.
+        if len(revs) == 0 or rev_start[0] != revs[0]:
             revs = rev_start + revs
         # else: no commit happened since last window, don't add duplicate
         #       revisions
@@ -537,16 +545,21 @@ def get_before_arg(num_months):
     # first commit does not carry the earliest commit date
     revs = [rev.split(",") for rev in revs]
     rev_len = len(revs)
-    if int(revs[0][1]) > int(revs[1][1]):
+    if len(revs) >= 2 and int(revs[0][1]) > int(revs[1][1]):
       del revs[0]
 
-    # Extract hash values and dates intro seperate lists
+    if len(revs) < 2:
+        log.critical("The repository contains only a single commit. "
+                     "At least two commits are required for analysis.")
+        sys.exit(1)
+
+    # Extract hash values and dates into separate lists
     revs_hash = [rev[0] for rev in revs]
     revs_date = [rev[2].split(" ")[0] for rev in revs]
 
-    # We cannot detect release canndidate tags in this analysis mode,
+    # We cannot detect release candidate tags in this analysis mode,
     # so provide a list with None entries
-    rcs = [None for x in range(len(revs))]
+    rcs = [None for x in range(len(revs_hash))]
 
     return revs_hash, rcs, revs_date