Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions codeface/R/analyse_ts.r
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ compute.next.timestamp <- function(time, last.time) {
## Take a list of commits and make their date indices unique by
## adding a one second offset to identical ones.
make.index.unique <- function(dat, subset) {
if (nrow(dat) == 0) return(dat)
dat$commitDate <- ymd_hms(dat$commitDate, quiet=TRUE)
last.timestamp <- min(dat$commitDate) - dseconds(1)

Expand Down Expand Up @@ -111,6 +112,7 @@ gen.full.ts <- function(conf) {
}

for (i in 1:length(ts)) {
if (is.null(ts[[i]])) next
ts[[i]]$ChangedLines <- ts[[i]]$AddedLines + ts[[i]]$DeletedLines
full.series[[i]] <- na.omit(xts(ts[[i]]$ChangedLines,
order.by=ts[[i]]$commitDate))
Expand Down Expand Up @@ -140,8 +142,6 @@ gen.rev.list <- function(revisions) {
## data point. Using the robust median instead of mean considerably
## reduces the amount of outliers
process.ts <- function(series) {
duration <- end(series) - start(series)

## We compute the window lengths based on natural time units
## to avoid dependencies on the lifetime of the project, or on the
## project's relative activity
Expand Down Expand Up @@ -521,6 +521,11 @@ do.ts.analysis <- function(resdir, graphdir, conf) {
## Prepare the raw time series as input to the smoothing
## algorithms
full.ts <- gen.full.ts(conf)
if (is.null(full.ts) || length(full.ts) == 0) {
logwarn("No commit data found for any release range; skipping time series analysis",
logger="analyse_ts")
return(invisible(NULL))
}
series.merged <- process.ts(full.ts)

## Prepare y ranges for the different graph types
Expand Down
2 changes: 1 addition & 1 deletion codeface/R/ts_utils.r
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ suppressPackageStartupMessages(library(dtw))
## Omit time series elements that exceed the given range.
##
## series: object supporting index() and row subsetting
##         (presumably an xts/zoo time series -- TODO confirm against callers)
## start:  lower bound of the range to keep
## end:    upper bound of the range to keep (exclusive, see first filter)
## Returns the subset of series that passes all filters below.
trim.series <- function(series, start, end) {
## Keep only the elements whose index lies strictly before end
series <- series[which(index(series) < end),]
## Keep only the elements whose index lies strictly after start
series <- series[which(index(series) > start),]
## NOTE(review): after the strict filter above this inclusive filter cannot
## remove anything. In this diff view it looks like the replacement for the
## previous line (making start inclusive) -- confirm which of the two lines
## is meant to remain.
series <- series[which(index(series) >= start),]

return(series)
}
Expand Down
61 changes: 42 additions & 19 deletions codeface/VCS.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,9 @@ def __init__(self):
# file names to include in the analysis (non-tag-based)
self._fileNames = None

#when True, skip the source-file extension filter in addFiles4Analysis
self._all_files = False

self.subsys_description = {}

def getCommitDict(self):
Expand All @@ -145,6 +148,9 @@ def getFileCommitDict(self):
# Store the list of file names to include in the analysis; the value is
# kept in self._fileNames and handed back unchanged by getFileNames().
def setFileNames(self, fileNames):
self._fileNames = fileNames

# Set the flag that controls the source-file extension filter: when the
# stored value is true, addFiles4Analysis keeps every file touched by the
# commits instead of only those with a known source-code extension.
def setAllFiles(self, value):
self._all_files = value

# Return the file names selected for analysis (self._fileNames); this is
# None until setFileNames() has stored a list.
def getFileNames(self):
return self._fileNames

Expand Down Expand Up @@ -1224,13 +1230,17 @@ def _prepareFileCommitList(self, fnameList, link_type, singleBlame=True,
#revision range
rev = self.rev_end

# Check if file has been deleted
cmd = "git --git-dir={0} ls-tree".format(self.repo).split()
cmd.append("--name-only")
# Check if file has been deleted or is a submodule (gitlink)
cmd = "git -c core.quotepath=false --git-dir={0} ls-tree".format(self.repo).split()
cmd.append("--full-tree")
cmd.append("-r")
cmd.append(rev)
existing_files = execute_command(cmd).split()
ls_tree_output = execute_command(cmd).splitlines()
# ls-tree output format: "<mode> <type> <hash>\t<filename>"
# Exclude gitlinks (mode 160000) which represent git submodules;
# git blame cannot be run on a submodule path.
existing_files = [line.split('\t', 1)[1] for line in ls_tree_output
if line and not line.startswith('160000')]
if file_commit.filename in existing_files:
# retrieve blame data
if singleBlame: #only one set of blame data per file
Expand Down Expand Up @@ -1510,6 +1520,16 @@ def _getFunctionLines(self, file_layout_src, file_commit):
func_lines = self._parseSrcFileCtags(srcFile.name)
file_commit.artefact_line_range = False

if not func_lines:
# No functions detected by either Doxygen or ctags (e.g., for
# Markdown or other non-code files). Fall back to a single
# synthetic file-level artefact so that commits touching such
# files still populate commit_dependency instead of being dropped
# from the dependency-based analysis pipeline entirely.
# FILE_LEVEL is an established special entityId in this codebase.
func_lines = {0: "FILE_LEVEL"}
file_commit.artefact_line_range = True

# clean up src temp file
srcFile.close()

Expand Down Expand Up @@ -1562,30 +1582,33 @@ def addFiles4Analysis(self, cmt_id_list):
-- Input --
directories - a list of paths to limit the search for filenames
'''
cmd_base = 'git --git-dir={0} diff-tree'.format(self.repo).split()
cmd_base = 'git -c core.quotepath=false --git-dir={0} diff-tree'.format(self.repo).split()
cmd_base.append("--diff-filter=ACMRTB")
cmd_base.append("--no-commit-id")
cmd_base.append("--name-only")
cmd_base.append("-r")

#get all files touched by all commits
all_files = set()
touched_files = set()
for cmt_id in cmt_id_list:
cmd = cmd_base + [cmt_id]
cmt_files = execute_command(cmd).splitlines()
all_files.update(cmt_files)

#filter results to only get implementation files
fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm",
".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb",
'.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb",
'.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex",
".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs",
".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature",
".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm")

fileNames = [fileName for fileName in all_files if
fileName.lower().endswith(fileExt)]
touched_files.update(cmt_files)

if self._all_files:
fileNames = list(touched_files)
else:
#filter results to only get implementation files
fileExt = (".c", ".cc", ".cpp", ".cxx", ".cs", ".asmx", ".m", ".mm",
".js", ".coffee", ".java", ".j", ".jav", ".php",".py", ".sh", ".ps1", ".rb",
'.d', '.php4', '.php5', '.inc', '.phtml', '.m', '.mm', ".ada", ".erl", ".bb",
'.f', '.for', '.f90', '.idl', '.ddl', '.odl', '.tcl', 'sql', ".q", ".exs", ".ex",
".ru", ".rs", ".ts", ".go", ".dart", ".r", ".rscript", ".vue", # ".hs",
".pl", ".pm", ".swift", ".lua", ".scala", ".sc", ".lisp", ".lsp", # ".feature",
".groovy", ".gy", ".gv", ".gvy", ".gsh", ".kt", ".kts", ".ktm", ".es6", ".jsm")

fileNames = [fileName for fileName in touched_files if
fileName.lower().endswith(fileExt)]

self.setFileNames(fileNames)

Expand Down
8 changes: 7 additions & 1 deletion codeface/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ def get_parser():
help="Re-use an already existing vcs-analysis.db file. "
"This flag is useful to continue a previously failed analysis"
" or for debugging purposes.")
run_parser.add_argument(
'--all-files', action='store_true', dest="all_files",
help="Include all files touched by commits in the analysis, "
"bypassing the built-in source-file extension filter. "
"By default only known source-code extensions are included.")

ml_parser = sub_parser.add_parser('ml', help='Run mailing list analysis')
ml_parser.set_defaults(func=cmd_ml)
Expand Down Expand Up @@ -126,7 +131,8 @@ def cmd_run(args):
logfile = os.path.abspath(logfile)
project_analyse(resdir, gitdir, codeface_conf, project_conf,
args.no_report, args.loglevel, logfile, args.recreate,
args.profile_r, args.jobs, args.tagging, args.reuse_db)
args.profile_r, args.jobs, args.tagging, args.reuse_db,
args.all_files)
return 0

def cmd_ml(args):
Expand Down
6 changes: 4 additions & 2 deletions codeface/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@


def createDB(filename, git_repo, revrange, subsys_descr, link_type,
range_by_date, rcranges=None):
range_by_date, rcranges=None, all_files=False):
#------------------
#configuration
#------------------
Expand All @@ -57,6 +57,7 @@ def createDB(filename, git_repo, revrange, subsys_descr, link_type,
git.setRevisionRange(revrange[0], revrange[1])
git.setSubsysDescription(subsys_descr)
git.setRangeByDate(range_by_date)
git.setAllFiles(all_files)

if rcranges != None:
git.setRCRanges(rcranges)
Expand Down Expand Up @@ -1854,7 +1855,8 @@ def performAnalysis(conf, dbm, dbfilename, git_repo, revrange, subsys_descr,
log.devinfo("Creating data base for {0}..{1}".format(revrange[0],
revrange[1]))
createDB(dbfilename, git_repo, revrange, subsys_descr, \
link_type, range_by_date, rcranges)
link_type, range_by_date, rcranges,
all_files=conf.get("all_files", False))
else:
log.warning("REUSING data base for {0}..{1} "
"(make sure it is up to date)"
Expand Down
3 changes: 2 additions & 1 deletion codeface/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def project_setup(conf, recreate):

def project_analyse(resdir, gitdir, codeface_conf, project_conf,
no_report, loglevel, logfile, recreate, profile_r,
n_jobs, tagging_type, reuse_db):
n_jobs, tagging_type, reuse_db, all_files=False):
pool = BatchJobPool(int(n_jobs))
conf = Configuration.load(codeface_conf, project_conf)
tagging = conf["tagging"]
Expand All @@ -74,6 +74,7 @@ def project_analyse(resdir, gitdir, codeface_conf, project_conf,
repo = pathjoin(gitdir, conf["repo"], ".git")
project_resdir = pathjoin(resdir, project, tagging)
range_by_date = False
conf["all_files"] = all_files

# When revisions are not provided by the configuration file
# generate the analysis window automatically
Expand Down
27 changes: 20 additions & 7 deletions codeface/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,13 @@ def execute_command(cmd, ignore_errors=False, direct_io=False, cwd=None, silent_
If direct_io is True, do not capture the stdin and stdout of the command.
Returns the stdout of the command.
'''
jcmd = " ".join(cmd)
# In Python 2, cmd may contain a mix of unicode (e.g. repo path from
# PyYAML config) and bytes (e.g. filenames from git output). A plain
# " ".join() would then try to decode bytes with non-ASCII content
# (like emoji filenames) as ASCII and raise UnicodeDecodeError.
# Encode any unicode elements to UTF-8 bytes so the join stays in bytes.
jcmd = b" ".join(s.encode('utf-8') if isinstance(s, unicode) else s
for s in cmd)
log.debug("Running command: {}".format(jcmd))
try:
if direct_io:
Expand Down Expand Up @@ -526,8 +532,10 @@ def get_before_arg(num_months):
end = start
start = end + window_size_months

# Check if any commits occurred since the last analysis window
if rev_start[0] != revs[0]:
# Check if any commits occurred since the last analysis window.
# revs may be empty if no commit was found by the initial --before query
# (e.g. single-commit repo); treat that the same as a new entry.
if len(revs) == 0 or rev_start[0] != revs[0]:
revs = rev_start + revs
# else: no commit happened since last window, don't add duplicate
# revisions
Expand All @@ -537,16 +545,21 @@ def get_before_arg(num_months):
# first commit does not carry the earliest commit date
revs = [rev.split(",") for rev in revs]
rev_len = len(revs)
if int(revs[0][1]) > int(revs[1][1]):
if len(revs) >= 2 and int(revs[0][1]) > int(revs[1][1]):
del revs[0]

# Extract hash values and dates intro seperate lists
if len(revs) < 2:
log.critical("The repository contains only a single commit. "
"At least two commits are required for analysis.")
sys.exit(1)

# Extract hash values and dates into separate lists
revs_hash = [rev[0] for rev in revs]
revs_date = [rev[2].split(" ")[0] for rev in revs]

# We cannot detect release canndidate tags in this analysis mode,
# We cannot detect release candidate tags in this analysis mode,
# so provide a list with None entries
rcs = [None for x in range(len(revs))]
rcs = [None for x in range(len(revs_hash))]

return revs_hash, rcs, revs_date

Expand Down