From 916ced1ed2d8bd28b869740294b51e79177874ac Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 11:01:24 -0800 Subject: [PATCH 01/15] add multiprocessing support --- gitfame/_gitfame.py | 85 ++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 31 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 4725492..6577ee8 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -58,7 +58,9 @@ from __future__ import division, print_function import logging +import multiprocessing import os +import queue import re import subprocess # from __future__ import absolute_import @@ -206,10 +208,30 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost # return totals + tighten(tabber(...), max_width=TERM_WIDTH) -def _get_auth_stats(gitdir, branch="HEAD", since=None, include_files=None, exclude_files=None, - silent_progress=False, ignore_whitespace=False, M=False, C=False, - warn_binary=False, bytype=False, show_email=False, prefix_gitdir=False, - churn=None, ignore_rev="", ignore_revs_file=None, until=None): +def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until): + blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) + + log.log(logging.NOTSET, blame_out) + + if since: + # Strip boundary messages, + # preventing user with nearest commit to boundary owning the LOC + blame_out = RE_BLAME_BOUNDS.sub('', blame_out) + + if until: + # Strip boundary messages, + # preventing user with nearest commit to boundary owning the LOC + blame_out = RE_BLAME_BOUNDS.sub('', blame_out) + + return blame_out + + +def _get_auth_stats( + gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, + silent_progress=False, ignore_whitespace=False, M=False, C=False, + warn_binary=False, bytype=False, show_email=False, prefix_gitdir=False, + churn=None, ignore_rev="", ignore_revs_file=None, until=None +): """Returns dict: {"": {"loc": int, "files": {}, "commits": int, "ctimes": [int]}}""" until = ["--until", until] if until else [] since = ["--since", since] if since else [] @@ -271,30 +293,34 @@ def stats_append(fname, auth, loc, tstamp): auth_stats[auth][fext_key] = loc if churn & CHURN_SLOC: - for fname in tqdm(file_list, desc=gitdir if prefix_gitdir else "Processing", - disable=silent_progress, unit="file"): - if prefix_gitdir: - fname = path.join(gitdir, fname) - try: - blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) - except Exception as err: - getattr(log, "warn" if warn_binary else "debug")(fname + ':' + str(err)) - continue - log.log(logging.NOTSET, blame_out) - - if since: - # Strip boundary messages, - # preventing user with nearest commit to boundary owning the LOC - blame_out = RE_BLAME_BOUNDS.sub('', blame_out) - - if until: - # Strip boundary messages, - # preventing user with nearest commit to boundary owning the LOC - blame_out = RE_BLAME_BOUNDS.sub('', blame_out) - - for loc, auth, tstamp in RE_AUTHS_BLAME.findall(blame_out): # for each chunk - loc = int(loc) - stats_append(fname, auth, loc, tstamp) + completed = queue.Queue() + + def process_blame_out(fname, blame_out): + for loc, auth, tstamp in RE_AUTHS_BLAME.findall(blame_out): # for each chunk + stats_append(fname, auth, int(loc), tstamp) + + completed.put(None) + + def process_blame_out_error(fname, err): + getattr(log, "warn" if warn_binary else "debug")(fname + ':' + str(err)) + completed.put(None) + + with multiprocessing.Pool() as mp_pool: + for fname in file_list: + if prefix_gitdir: + fname = path.join(gitdir, fname) + + mp_pool.apply_async( + _get_blame_out, + args=(base_cmd, branch, fname, since, until), + callback=partial(process_blame_out, fname), + error_callback=partial(process_blame_out_error, fname) + ) + + for _ in tqdm(file_list, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, unit="file"): + completed.get() + + mp_pool.join() else: with tqdm(total=1, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, @@ -368,7 +394,6 @@ def run(args): if isinstance(args.gitdir, str): args.gitdir = [args.gitdir] # strip `/`, `.git` - gitdirs = [i.rstrip(os.sep) for i in args.gitdir] gitdirs = [ path.join(*path.split(i)[:-1]) if path.split(i)[-1] == '.git' else i for i in args.gitdir] # remove duplicates @@ -390,8 +415,6 @@ def run(args): dirs.remove('.git') i += 1 - exclude_files = None - include_files = None if args.no_regex: exclude_files = set(RE_CSPILT.split(args.excl)) include_files = set() From 286cfaca83d3704f9789a519fda0123bede4e53f Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 14:51:35 -0800 Subject: [PATCH 02/15] need to close before join --- gitfame/_gitfame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 6577ee8..0e41614 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -320,6 +320,7 @@ def process_blame_out_error(fname, err): for _ in tqdm(file_list, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, unit="file"): completed.get() + mp_pool.close() mp_pool.join() else: From f9ef391902be3a8c643895788189024701517e3b Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 16:52:54 -0800 Subject: [PATCH 03/15] add support for passing # of processes --- gitfame/_gitfame.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 0e41614..8d5e91d 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -54,6 +54,7 @@ Any `tabulate.tabulate_formats` is also accepted. --manpath= Directory in which to install git-fame man pages. --log= FATAL|CRITICAL|ERROR|WARN(ING)|[default: INFO]|DEBUG|NOTSET. + --processes= [default: 1]Number of processes to use for parallelization """ from __future__ import division, print_function @@ -230,7 +231,7 @@ def _get_auth_stats( gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, silent_progress=False, ignore_whitespace=False, M=False, C=False, warn_binary=False, bytype=False, show_email=False, prefix_gitdir=False, - churn=None, ignore_rev="", ignore_revs_file=None, until=None + churn=None, ignore_rev="", ignore_revs_file=None, until=None, processes=1 ): """Returns dict: {"": {"loc": int, "files": {}, "commits": int, "ctimes": [int]}}""" until = ["--until", until] if until else [] @@ -305,7 +306,7 @@ def process_blame_out_error(fname, err): getattr(log, "warn" if warn_binary else "debug")(fname + ':' + str(err)) completed.put(None) - with multiprocessing.Pool() as mp_pool: + with multiprocessing.Pool(processes) as mp_pool: for fname in file_list: if prefix_gitdir: fname = path.join(gitdir, fname) @@ -456,7 +457,7 @@ def run(args): ignore_whitespace=args.ignore_whitespace, M=args.M, C=args.C, warn_binary=args.warn_binary, bytype=args.bytype, show_email=args.show_email, prefix_gitdir=len(gitdirs) > 1, churn=churn, ignore_rev=args.ignore_rev, - ignore_revs_file=args.ignore_revs_file) + ignore_revs_file=args.ignore_revs_file, processes=args.processes) # concurrent multi-repo processing if len(gitdirs) > 1: From bb37570c44fb501449d5bd832d499dd58770fcfa Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 17:10:54 -0800 Subject: [PATCH 04/15] fix typing --- gitfame/_gitfame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 8d5e91d..9da0d92 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -54,7 +54,7 @@ Any `tabulate.tabulate_formats` is also accepted. --manpath= Directory in which to install git-fame man pages. --log= FATAL|CRITICAL|ERROR|WARN(ING)|[default: INFO]|DEBUG|NOTSET. - --processes= [default: 1]Number of processes to use for parallelization + --processes= int, Number of processes to use for parallelization [default: 1] """ from __future__ import division, print_function @@ -457,7 +457,7 @@ def run(args): ignore_whitespace=args.ignore_whitespace, M=args.M, C=args.C, warn_binary=args.warn_binary, bytype=args.bytype, show_email=args.show_email, prefix_gitdir=len(gitdirs) > 1, churn=churn, ignore_rev=args.ignore_rev, - ignore_revs_file=args.ignore_revs_file, processes=args.processes) + ignore_revs_file=args.ignore_revs_file, processes=int(args.processes)) # concurrent multi-repo processing if len(gitdirs) > 1: From ea32224a6a7ab96ac82c211dd2725d46ed045530 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 19:02:54 -0800 Subject: [PATCH 05/15] add support for author canonicalization --- gitfame/_gitfame.py | 84 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 9da0d92..96bd410 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -55,9 +55,12 @@ --manpath= Directory in which to install git-fame man pages. --log= FATAL|CRITICAL|ERROR|WARN(ING)|[default: INFO]|DEBUG|NOTSET. --processes= int, Number of processes to use for parallelization [default: 1] + --author-mapping-file-path= Path to file containing dictionary mapping author name to normalized author name + --author-email-mapping-file-path= Path to file containing dictionary mapping author email address to normalized author name """ from __future__ import division, print_function +import ast import logging import multiprocessing import os @@ -67,6 +70,7 @@ # from __future__ import absolute_import from functools import partial from os import path +from pathlib import Path from ._utils import (TERM_WIDTH, Str, TqdmStream, check_output, fext, int_cast_or_len, merge_stats, print_unicode, tqdm) @@ -209,29 +213,70 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost # return totals + tighten(tabber(...), max_width=TERM_WIDTH) +_RE_BLAME_START_LINE = re.compile(r'^(?P[a-f0-9]+) (?P\d+) (?P\d+) ?(?P\d+)?$') + def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until): blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) + commit_info = dict() + commit_infos = [] + for line in blame_out.splitlines(): + if match := _RE_BLAME_START_LINE.match(line): + if commit_info: + commit_infos.append(commit_info) + commit_info = {'loc': match['lines_of_code']} + elif line.startswith('\t'): + continue + else: + key, value = line.split(' ', 1) + commit_info[key] = value + + if commit_info: + commit_infos.append(commit_info) + log.log(logging.NOTSET, blame_out) - if since: - # Strip boundary messages, - # preventing user with nearest commit to boundary owning the LOC - blame_out = RE_BLAME_BOUNDS.sub('', blame_out) + # TODO + assert not since and not until + + # if since: + # # Strip boundary messages, + # # preventing user with nearest commit to boundary owning the LOC + # blame_out = RE_BLAME_BOUNDS.sub('', blame_out) + # + # if until: + # # Strip boundary messages, + # # preventing user with nearest commit to boundary owning the LOC + # blame_out = RE_BLAME_BOUNDS.sub('', blame_out) + + return commit_infos - if until: - # Strip boundary messages, - # preventing user with nearest commit to boundary owning the LOC - blame_out = RE_BLAME_BOUNDS.sub('', blame_out) - return blame_out +def _get_user_canonicalization_function(author_mapping_file_path: str = None, author_email_mapping_file_path: str = None): + user_mappings = dict() + if author_mapping_file_path: + with Path(author_mapping_file_path).expanduser().open('rt') as f: + user_mappings = ast.literal_eval(f.read()) + + email_mappings = dict() + if author_email_mapping_file_path: + with Path(author_email_mapping_file_path).expanduser().open('rt') as f: + email_mappings = ast.literal_eval(f.read()) + + def canonicalize(author: str, author_email: str): + author = user_mappings.get(author, author) + author = email_mappings.get(author_email, author) + return author + + return canonicalize def _get_auth_stats( gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, silent_progress=False, ignore_whitespace=False, M=False, C=False, warn_binary=False, bytype=False, show_email=False, prefix_gitdir=False, - churn=None, ignore_rev="", ignore_revs_file=None, until=None, processes=1 + churn=None, ignore_rev="", ignore_revs_file=None, until=None, processes: int = 1, + author_mapping_file_path: str = None, author_email_mapping_file_path: str = None, ): """Returns dict: {"": {"loc": int, "files": {}, "commits": int, "ctimes": [int]}}""" until = ["--until", until] if until else [] @@ -274,9 +319,13 @@ def _get_auth_stats( auth_stats = {} - def stats_append(fname, auth, loc, tstamp): + author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, author_email_mapping_file_path) + + def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str): auth = str(auth) + auth = author_canonicalizer(auth, author_email) tstamp = int(tstamp) + try: auth_stats[auth]["loc"] += loc except KeyError: @@ -296,9 +345,10 @@ def stats_append(fname, auth, loc, tstamp): if churn & CHURN_SLOC: completed = queue.Queue() - def process_blame_out(fname, blame_out): - for loc, auth, tstamp in RE_AUTHS_BLAME.findall(blame_out): # for each chunk - stats_append(fname, auth, int(loc), tstamp) + def process_blame_out(blame_out): + for info in blame_out: # for each chunk + if info['loc']: + stats_append(info['filename'], info['author'], int(info['loc']), info['committer-time'], info['author-mail']) completed.put(None) @@ -314,7 +364,7 @@ def process_blame_out_error(fname, err): mp_pool.apply_async( _get_blame_out, args=(base_cmd, branch, fname, since, until), - callback=partial(process_blame_out, fname), + callback=process_blame_out, error_callback=partial(process_blame_out_error, fname) ) @@ -457,7 +507,9 @@ def run(args): ignore_whitespace=args.ignore_whitespace, M=args.M, C=args.C, warn_binary=args.warn_binary, bytype=args.bytype, show_email=args.show_email, prefix_gitdir=len(gitdirs) > 1, churn=churn, ignore_rev=args.ignore_rev, - ignore_revs_file=args.ignore_revs_file, processes=int(args.processes)) + ignore_revs_file=args.ignore_revs_file, processes=int(args.processes), + author_mapping_file_path=args.author_mapping_file_path, + author_email_mapping_file_path=args.author_email_mapping_file_path) # concurrent multi-repo processing if len(gitdirs) > 1: From 01a3138547d56444002eb268de6984f88632c4bb Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 23:09:25 -0800 Subject: [PATCH 06/15] rework binary detection and loc logic --- gitfame/_gitfame.py | 108 +++++++++++++++++++++++++++++++------------- 1 file changed, 77 insertions(+), 31 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 96bd410..241e628 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -61,16 +61,19 @@ from __future__ import division, print_function import ast +import codecs import logging import multiprocessing import os import queue import re import subprocess +from collections import defaultdict # from __future__ import absolute_import from functools import partial from os import path from pathlib import Path +from typing import Dict from ._utils import (TERM_WIDTH, Str, TqdmStream, check_output, fext, int_cast_or_len, merge_stats, print_unicode, tqdm) @@ -215,26 +218,34 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost _RE_BLAME_START_LINE = re.compile(r'^(?P[a-f0-9]+) (?P\d+) (?P\d+) ?(?P\d+)?$') -def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until): +class _CommitInfo: + def __init__(self): + self.file_locs = defaultdict(int) # {file_name: [loc, ... + self.info = dict() + + +def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until) -> Dict[str, _CommitInfo]: blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) - commit_info = dict() - commit_infos = [] + commit_infos = defaultdict(_CommitInfo) # {commit: {file: commit_info + commit = None + loc = None + for line in blame_out.splitlines(): if match := _RE_BLAME_START_LINE.match(line): - if commit_info: - commit_infos.append(commit_info) - commit_info = {'loc': match['lines_of_code']} + commit = match['commit_hash'] + loc = int(match['lines_of_code']) # needs to be applied to each file of the commit elif line.startswith('\t'): continue + elif line == 'boundary': + continue else: key, value = line.split(' ', 1) - commit_info[key] = value - - if commit_info: - commit_infos.append(commit_info) - log.log(logging.NOTSET, blame_out) + if key == 'filename': + commit_infos[commit].file_locs[value] += loc + else: + commit_infos[commit].info[key] = value # TODO assert not since and not until @@ -249,7 +260,10 @@ def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until): # # preventing user with nearest commit to boundary owning the LOC # blame_out = RE_BLAME_BOUNDS.sub('', blame_out) - return commit_infos + for cinfo in commit_infos.values(): + cinfo.file_locs = dict(cinfo.file_locs) + + return dict(commit_infos) def _get_user_canonicalization_function(author_mapping_file_path: str = None, author_email_mapping_file_path: str = None): @@ -271,6 +285,24 @@ def canonicalize(author: str, author_email: str): return canonicalize +_RE_EOL_LINE = re.compile(r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$') + + +def detect_bom(path: str, default = None): + with open(path, 'rb') as f: + raw = f.read(4) # will read less if the file is smaller + + # BOM_UTF32_LE's start is equal to BOM_UTF16_LE so need to try the former first + for enc, boms in ( + ('utf-8-sig', (codecs.BOM_UTF8,)), + ('utf-32', (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)), + ('utf-16', (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) + ): + if any(raw.startswith(bom) for bom in boms): + return enc + + return default + def _get_auth_stats( gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, silent_progress=False, ignore_whitespace=False, M=False, C=False, @@ -283,26 +315,40 @@ def _get_auth_stats( since = ["--since", since] if since else [] git_cmd = ["git", "-C", gitdir] log.debug("base command:%s", git_cmd) - file_list = check_output(git_cmd + ["ls-files", "--with-tree", branch]).strip().split('\n') - text_file_list = check_output(git_cmd + ["grep", "-I", "--name-only", ".", branch]).strip() - text_file_list = set( - re.sub(f"^{re.escape(branch)}:", "", text_file_list, flags=re.M).split('\n')) - if not hasattr(include_files, 'search'): - file_list = [ - i for i in file_list if (not include_files or (i in include_files)) - if i not in exclude_files] - else: - file_list = [ - i for i in file_list if include_files.search(i) - if not (exclude_files and exclude_files.search(i))] - for fname in set(file_list) - text_file_list: + + file_list = check_output(git_cmd + ["ls-files", "--eol", "--with-tree", branch]).strip().splitlines() + binary_file_list = [] + text_file_list = [] + for f in file_list: + m = _RE_EOL_LINE.match(f) + fpath = m['fpath'] + + if not hasattr(include_files, 'search'): + if (include_files and fpath not in include_files) or fpath in exclude_files: + continue + elif (not include_files.search(fpath)) or (exclude_files and exclude_files.search(fpath)): + continue + + if m['eol_worktree'] == 'w/-text': + binary_file_list.append(fpath) + else: + text_file_list.append(fpath) + + # we need to inspect if the binary_files are unicode + for f in list(binary_file_list): + if detect_bom(f): + binary_file_list.remove(f) + text_file_list.append(f) + + for fname in binary_file_list: getattr(log, "warn" if warn_binary else "debug")("binary:%s", fname.strip()) - file_list = [f for f in file_list if f in text_file_list] # preserve order + + file_list = text_file_list # preserve order log.log(logging.NOTSET, "files:%s", file_list) churn = churn or set() if churn & CHURN_SLOC: - base_cmd = git_cmd + ["blame", "--line-porcelain"] + since + until + base_cmd = git_cmd + ["blame", "--line-porcelain", "--incremental"] + since + until if ignore_rev: base_cmd.extend(["--ignore-rev", ignore_rev]) if ignore_revs_file: @@ -345,10 +391,10 @@ def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str if churn & CHURN_SLOC: completed = queue.Queue() - def process_blame_out(blame_out): - for info in blame_out: # for each chunk - if info['loc']: - stats_append(info['filename'], info['author'], int(info['loc']), info['committer-time'], info['author-mail']) + def process_blame_out(commit_infos: Dict[str, _CommitInfo]): + for cinfo in commit_infos.values(): # for each chunk + for fname, loc in cinfo.file_locs.items(): + stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], cinfo.info['author-mail']) completed.put(None) From bb0478fbf04b8f7781166aaa50000caeaf9da736 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Wed, 20 Nov 2024 23:15:10 -0800 Subject: [PATCH 07/15] pathfix --- gitfame/_gitfame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 241e628..4c1b010 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -218,6 +218,7 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost _RE_BLAME_START_LINE = re.compile(r'^(?P[a-f0-9]+) (?P\d+) (?P\d+) ?(?P\d+)?$') + class _CommitInfo: def __init__(self): self.file_locs = defaultdict(int) # {file_name: [loc, ... @@ -336,7 +337,7 @@ def _get_auth_stats( # we need to inspect if the binary_files are unicode for f in list(binary_file_list): - if detect_bom(f): + if detect_bom(path.join(gitdir, f)): binary_file_list.remove(f) text_file_list.append(f) From f81de3776ded5e05b9d86eb8155a7176bf5a2f29 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 01:15:24 -0800 Subject: [PATCH 08/15] update other codepath, optimize, and make more readable --- gitfame/_gitfame.py | 97 +++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 4c1b010..d5508c1 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -103,7 +103,7 @@ r'^\w+\s+\d+\s+\d+(\s+\d+)?\s*$[^\t]*?^boundary\s*$[^\t]*?^\t.*?$\r?\n', flags=re.M | re.DOTALL) # processing `log --format="aN%aN ct%ct" --numstat` -RE_AUTHS_LOG = re.compile(r"^aN(.+?) ct(\d+)\n\n", flags=re.M) +RE_AUTHS_LOG = re.compile(r"^aN(.+?) aE(.+?) H([a-f0-9]+) ct(\d+)\n\n", flags=re.M) RE_STAT_BINARY = re.compile(r"^\s*?-\s*-.*?\n", flags=re.M) RE_RENAME = re.compile(r"\{.+? => (.+?)\}") # finds all non-escaped commas @@ -139,15 +139,25 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost COL_NAMES = ['Author', 'loc', 'coms', 'fils', ' distribution'] it_as = getattr(auth_stats, 'iteritems', auth_stats.items) # get ready - tab = [[ - auth, s['loc'], - s.get('commits', 0), - len(s.get('files', [])), '/'.join( - map('{0:4.1f}'.format, - (100 * s['loc'] / max(1, stats_tot['loc']), - 100 * s.get('commits', 0) / max(1, stats_tot['commits']), - 100 * len(s.get('files', [])) / max(1, stats_tot['files'])))).replace( - '/100.0/', '/ 100/')] for (auth, s) in it_as()] + tab = [ + [ + auth, + s['loc'], + s.get('commits', 0), + len(s.get('files', [])), + '/'.join( + map( + '{0:4.1f}'.format, + ( + 100 * s['loc'] / max(1, stats_tot['loc']), + 100 * s.get('commits', 0) / max(1, stats_tot['commits']), + 100 * len(s.get('files', [])) / max(1, stats_tot['files']) + ) + ) + ).replace('/100.0/', '/ 100/') + ] + for (auth, s) in it_as() + ] if cost: stats_tot = dict(stats_tot) if cost & COST_MONTHS: @@ -355,7 +365,7 @@ def _get_auth_stats( if ignore_revs_file: base_cmd.extend(["--ignore-revs-file", ignore_revs_file]) else: - base_cmd = git_cmd + ["log", "--format=aN%aN ct%ct", "--numstat"] + since + until + base_cmd = git_cmd + ["log", "--format=aN%aN aE%aE H%H ct%ct", "--numstat"] + since + until if ignore_whitespace: base_cmd.append("-w") @@ -364,38 +374,37 @@ def _get_auth_stats( if C: base_cmd.extend(["-C", "-C"]) # twice to include file creation - auth_stats = {} + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': set()}) # {author: {[loc,files,ctimes,exts]: + auth2em = defaultdict(set) author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, author_email_mapping_file_path) - def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str): - auth = str(auth) + def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str, commit_id: str): auth = author_canonicalizer(auth, author_email) tstamp = int(tstamp) - try: - auth_stats[auth]["loc"] += loc - except KeyError: - auth_stats[auth] = {"loc": loc, "files": {fname}, "ctimes": []} - else: - auth_stats[auth]["files"].add(fname) - auth_stats[auth]["ctimes"].append(tstamp) + auth2em[auth].add(author_email) + + i = auth_stats[auth] + i["loc"] += loc + i["files"].add(fname) + i["ctimes"].append(tstamp) + i['commits'].add(commit_id) if bytype: fext_key = f".{fext(fname) or '_None_ext'}" - # auth_stats[auth].setdefault(fext_key, 0) try: - auth_stats[auth][fext_key] += loc + i[fext_key] += loc except KeyError: - auth_stats[auth][fext_key] = loc + i[fext_key] = loc if churn & CHURN_SLOC: completed = queue.Queue() def process_blame_out(commit_infos: Dict[str, _CommitInfo]): - for cinfo in commit_infos.values(): # for each chunk + for commit_id, cinfo in commit_infos.items(): # for each chunk for fname, loc in cinfo.file_locs.items(): - stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], cinfo.info['author-mail']) + stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], cinfo.info['author-mail'], commit_id) completed.put(None) @@ -420,10 +429,11 @@ def process_blame_out_error(fname, err): mp_pool.close() mp_pool.join() - else: - with tqdm(total=1, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, - unit="repo") as t: + with tqdm( + total=1, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, + unit="repo" + ) as t: blame_out = check_output(base_cmd + [branch], stderr=subprocess.STDOUT) t.update() log.log(logging.NOTSET, blame_out) @@ -434,8 +444,8 @@ def process_blame_out_error(fname, err): blame_out = RE_STAT_BINARY.sub('', blame_out) blame_out = RE_AUTHS_LOG.split(blame_out) - blame_out = zip(blame_out[1::3], blame_out[2::3], blame_out[3::3]) - for auth, tstamp, fnames in blame_out: + blame_out = zip(blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], blame_out[5::5]) + for auth, auth_email, commit_hash, tstamp, fnames in blame_out: fnames = fnames.split('\naN', 1)[0] for i in fnames.strip().split('\n'): try: @@ -446,30 +456,23 @@ def process_blame_out_error(fname, err): fname = RE_RENAME.sub(r'\\2', fname) loc = (int(inss) if churn & CHURN_INS and inss else 0) + (int(dels) if churn & CHURN_DEL and dels else 0) - stats_append(fname, auth, loc, tstamp) + stats_append(fname, auth, int(loc), tstamp, auth_email, commit_hash) + + # translate commit-ids to # of commits + for astat in auth_stats.values(): + astat['commits'] = len(astat['commits']) # quickly count commits (even if no surviving loc) log.log(logging.NOTSET, "authors:%s", list(auth_stats.keys())) - auth_commits = check_output(git_cmd + ["shortlog", "-s", "-e", branch] + since + until) - for stats in auth_stats.values(): - stats.setdefault("commits", 0) - log.debug(RE_NCOM_AUTH_EM.findall(auth_commits.strip())) - auth2em = {} - for (ncom, auth, em) in RE_NCOM_AUTH_EM.findall(auth_commits.strip()): - auth = str(auth) - auth2em[auth] = em # TODO: count most used email? - try: - auth_stats[auth]["commits"] += int(ncom) - except KeyError: - auth_stats[auth] = {"loc": 0, "files": set(), "commits": int(ncom), "ctimes": []} + if show_email: # replace author name with email log.debug(auth2em) old = auth_stats - auth_stats = {} + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': 0}) for auth, stats in getattr(old, 'iteritems', old.items)(): - i = auth_stats.setdefault(auth2em[auth], - {"loc": 0, "files": set(), "commits": 0, "ctimes": []}) + auth_email = list(auth2em[auth])[0] # TODO: count most used email? + i = auth_stats[auth_email] i["loc"] += stats["loc"] i["files"].update(stats["files"]) i["commits"] += stats["commits"] From c64d7b6eb7970cdc52c08296aa99026f7c75c0e6 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 10:02:53 -0800 Subject: [PATCH 09/15] flake --- .editorconfig | 13 ++++++++++ gitfame/_gitfame.py | 61 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 .editorconfig diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..d328b29 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,13 @@ +# https://editorconfig.org/ + +root = true + +[*.py] +# most closely replicates what we've already been doing +max_line_length = 99 +indent_style = space +indent_size = 4 +insert_final_newline = true +trim_trailing_whitespace = true +end_of_line = lf +charset = utf-8 diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index d5508c1..273e6c8 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -55,8 +55,10 @@ --manpath= Directory in which to install git-fame man pages. --log= FATAL|CRITICAL|ERROR|WARN(ING)|[default: INFO]|DEBUG|NOTSET. --processes= int, Number of processes to use for parallelization [default: 1] - --author-mapping-file-path= Path to file containing dictionary mapping author name to normalized author name - --author-email-mapping-file-path= Path to file containing dictionary mapping author email address to normalized author name + --author-mapping-file-path= Path to file containing dictionary mapping author name + to normalized author name + --author-email-mapping-file-path= Path to file containing dictionary mapping author + email address to normalized author name """ from __future__ import division, print_function @@ -226,7 +228,10 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost # return totals + tighten(tabber(...), max_width=TERM_WIDTH) -_RE_BLAME_START_LINE = re.compile(r'^(?P[a-f0-9]+) (?P\d+) (?P\d+) ?(?P\d+)?$') +_RE_BLAME_START_LINE = re.compile( + r'^(?P[a-f0-9]+) (?P\d+) ' + r'(?P\d+) ?(?P\d+)?$' +) class _CommitInfo: @@ -235,7 +240,9 @@ def __init__(self): self.info = dict() -def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until) -> Dict[str, _CommitInfo]: +def _get_blame_out( + base_cmd: list[str], branch: str, fname: str, since, until +) -> Dict[str, _CommitInfo]: blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) commit_infos = defaultdict(_CommitInfo) # {commit: {file: commit_info @@ -277,7 +284,9 @@ def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until) - return dict(commit_infos) -def _get_user_canonicalization_function(author_mapping_file_path: str = None, author_email_mapping_file_path: str = None): +def _get_user_canonicalization_function( + author_mapping_file_path: str = None, author_email_mapping_file_path: str = None +): user_mappings = dict() if author_mapping_file_path: with Path(author_mapping_file_path).expanduser().open('rt') as f: @@ -296,7 +305,9 @@ def canonicalize(author: str, author_email: str): return canonicalize -_RE_EOL_LINE = re.compile(r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$') +_RE_EOL_LINE = re.compile( + r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$' +) def detect_bom(path: str, default = None): @@ -327,7 +338,9 @@ def _get_auth_stats( git_cmd = ["git", "-C", gitdir] log.debug("base command:%s", git_cmd) - file_list = check_output(git_cmd + ["ls-files", "--eol", "--with-tree", branch]).strip().splitlines() + file_list = check_output( + git_cmd + ["ls-files", "--eol", "--with-tree", branch] + ).strip().splitlines() binary_file_list = [] text_file_list = [] for f in file_list: @@ -374,12 +387,18 @@ def _get_auth_stats( if C: base_cmd.extend(["-C", "-C"]) # twice to include file creation - auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': set()}) # {author: {[loc,files,ctimes,exts]: + auth_stats = defaultdict( + lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': set()} + ) # {author: {[loc,files,ctimes,exts]: auth2em = defaultdict(set) - author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, author_email_mapping_file_path) + author_canonicalizer = _get_user_canonicalization_function( + author_mapping_file_path, author_email_mapping_file_path + ) - def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str, commit_id: str): + def stats_append( + fname: str, auth: str, loc: int, tstamp: str, author_email: str, commit_id: str + ): auth = author_canonicalizer(auth, author_email) tstamp = int(tstamp) @@ -404,7 +423,14 @@ def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str def process_blame_out(commit_infos: Dict[str, _CommitInfo]): for commit_id, cinfo in commit_infos.items(): # for each chunk for fname, loc in cinfo.file_locs.items(): - stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], cinfo.info['author-mail'], commit_id) + stats_append( + fname, + cinfo.info['author'], + loc, + cinfo.info['committer-time'], + cinfo.info['author-mail'], + commit_id + ) completed.put(None) @@ -424,7 +450,10 @@ def process_blame_out_error(fname, err): error_callback=partial(process_blame_out_error, fname) ) - for _ in tqdm(file_list, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, unit="file"): + for _ in tqdm( + file_list, desc=gitdir if prefix_gitdir else "Processing", + disable=silent_progress, unit="file" + ): completed.get() mp_pool.close() @@ -444,7 +473,9 @@ def process_blame_out_error(fname, err): blame_out = RE_STAT_BINARY.sub('', blame_out) blame_out = RE_AUTHS_LOG.split(blame_out) - blame_out = zip(blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], blame_out[5::5]) + blame_out = zip( + blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], blame_out[5::5] + ) for auth, auth_email, commit_hash, tstamp, fnames in blame_out: fnames = fnames.split('\naN', 1)[0] for i in fnames.strip().split('\n'): @@ -468,7 +499,9 @@ def process_blame_out_error(fname, err): if show_email: # replace author name with email log.debug(auth2em) old = auth_stats - auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': 0}) + auth_stats = defaultdict( + lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': 0} + ) for auth, stats in getattr(old, 'iteritems', old.items)(): auth_email = list(auth2em[auth])[0] # TODO: count most used email? From 739eec4270508aedbaf559ff3517ae1b9fc769ac Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 18:54:44 -0800 Subject: [PATCH 10/15] flake --- gitfame/_gitfame.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 273e6c8..0866db1 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -237,7 +237,7 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost class _CommitInfo: def __init__(self): self.file_locs = defaultdict(int) # {file_name: [loc, ... - self.info = dict() + self.info = {} def _get_blame_out( @@ -287,12 +287,12 @@ def _get_blame_out( def _get_user_canonicalization_function( author_mapping_file_path: str = None, author_email_mapping_file_path: str = None ): - user_mappings = dict() + user_mappings = {} if author_mapping_file_path: with Path(author_mapping_file_path).expanduser().open('rt') as f: user_mappings = ast.literal_eval(f.read()) - email_mappings = dict() + email_mappings = {} if author_email_mapping_file_path: with Path(author_email_mapping_file_path).expanduser().open('rt') as f: email_mappings = ast.literal_eval(f.read()) @@ -310,7 +310,7 @@ def canonicalize(author: str, author_email: str): ) -def detect_bom(path: str, default = None): +def detect_bom(path: str, default=None): with open(path, 'rb') as f: raw = f.read(4) # will read less if the file is smaller @@ -325,6 +325,7 @@ def detect_bom(path: str, default = None): return default + def _get_auth_stats( gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, silent_progress=False, ignore_whitespace=False, M=False, C=False, @@ -388,7 +389,7 @@ def _get_auth_stats( base_cmd.extend(["-C", "-C"]) # twice to include file creation auth_stats = defaultdict( - lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': set()} + lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()} ) # {author: {[loc,files,ctimes,exts]: auth2em = defaultdict(set) @@ -500,7 +501,7 @@ def process_blame_out_error(fname, err): log.debug(auth2em) old = auth_stats auth_stats = defaultdict( - lambda: {'loc': 0, 'files': set(), 'ctimes': list(), 'commits': 0} + lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': 0} ) for auth, stats in getattr(old, 'iteritems', old.items)(): From f9bbd13fb1560713f026ccb34280f4767eca4c09 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 18:57:08 -0800 Subject: [PATCH 11/15] yapf --- gitfame/_gitfame.py | 156 +++++++++++++++++++------------------------- 1 file changed, 67 insertions(+), 89 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 0866db1..0743d74 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -141,25 +141,15 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost COL_NAMES = ['Author', 'loc', 'coms', 'fils', ' distribution'] it_as = getattr(auth_stats, 'iteritems', auth_stats.items) # get ready - tab = [ - [ - auth, - s['loc'], - s.get('commits', 0), - len(s.get('files', [])), - '/'.join( - map( - '{0:4.1f}'.format, - ( - 100 * s['loc'] / max(1, stats_tot['loc']), - 100 * s.get('commits', 0) / max(1, stats_tot['commits']), - 100 * len(s.get('files', [])) / max(1, stats_tot['files']) - ) - ) - ).replace('/100.0/', '/ 100/') - ] - for (auth, s) in it_as() - ] + tab = [[ + auth, s['loc'], + s.get('commits', 0), + len(s.get('files', [])), '/'.join( + map('{0:4.1f}'.format, + (100 * s['loc'] / max(1, stats_tot['loc']), + 100 * s.get('commits', 0) / max(1, stats_tot['commits']), + 100 * len(s.get('files', [])) / max(1, stats_tot['files'])))).replace( + '/100.0/', '/ 100/')] for (auth, s) in it_as()] if cost: stats_tot = dict(stats_tot) if cost & COST_MONTHS: @@ -228,31 +218,28 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost # return totals + tighten(tabber(...), max_width=TERM_WIDTH) -_RE_BLAME_START_LINE = re.compile( - r'^(?P[a-f0-9]+) (?P\d+) ' - r'(?P\d+) ?(?P\d+)?$' -) +_RE_BLAME_START_LINE = re.compile(r'^(?P[a-f0-9]+) (?P\d+) ' + r'(?P\d+) ?(?P\d+)?$') class _CommitInfo: def __init__(self): - self.file_locs = defaultdict(int) # {file_name: [loc, ... + self.file_locs = defaultdict(int) # {file_name: [loc, ... self.info = {} -def _get_blame_out( - base_cmd: list[str], branch: str, fname: str, since, until -) -> Dict[str, _CommitInfo]: +def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, + until) -> Dict[str, _CommitInfo]: blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) - commit_infos = defaultdict(_CommitInfo) # {commit: {file: commit_info + commit_infos = defaultdict(_CommitInfo) # {commit: {file: commit_info commit = None loc = None for line in blame_out.splitlines(): if match := _RE_BLAME_START_LINE.match(line): commit = match['commit_hash'] - loc = int(match['lines_of_code']) # needs to be applied to each file of the commit + loc = int(match['lines_of_code']) # needs to be applied to each file of the commit elif line.startswith('\t'): continue elif line == 'boundary': @@ -284,9 +271,8 @@ def _get_blame_out( return dict(commit_infos) -def _get_user_canonicalization_function( - author_mapping_file_path: str = None, author_email_mapping_file_path: str = None -): +def _get_user_canonicalization_function(author_mapping_file_path: str = None, + author_email_mapping_file_path: str = None): user_mappings = {} if author_mapping_file_path: with Path(author_mapping_file_path).expanduser().open('rt') as f: @@ -306,20 +292,17 @@ def canonicalize(author: str, author_email: str): _RE_EOL_LINE = re.compile( - r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$' -) + r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$') def detect_bom(path: str, default=None): with open(path, 'rb') as f: - raw = f.read(4) # will read less if the file is smaller + raw = f.read(4) # will read less if the file is smaller # BOM_UTF32_LE's start is equal to BOM_UTF16_LE so need to try the former first - for enc, boms in ( - ('utf-8-sig', (codecs.BOM_UTF8,)), - ('utf-32', (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)), - ('utf-16', (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)) - ): + for enc, boms in (('utf-8-sig', (codecs.BOM_UTF8,)), ('utf-32', (codecs.BOM_UTF32_LE, + codecs.BOM_UTF32_BE)), + ('utf-16', (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))): if any(raw.startswith(bom) for bom in boms): return enc @@ -327,11 +310,26 @@ def detect_bom(path: str, default=None): def _get_auth_stats( - gitdir: str, branch: str = "HEAD", since=None, include_files=None, exclude_files=None, - silent_progress=False, ignore_whitespace=False, M=False, C=False, - warn_binary=False, bytype=False, show_email=False, prefix_gitdir=False, - churn=None, ignore_rev="", ignore_revs_file=None, until=None, processes: int = 1, - author_mapping_file_path: str = None, author_email_mapping_file_path: str = None, + gitdir: str, + branch: str = "HEAD", + since=None, + include_files=None, + exclude_files=None, + silent_progress=False, + ignore_whitespace=False, + M=False, + C=False, + warn_binary=False, + bytype=False, + show_email=False, + prefix_gitdir=False, + churn=None, + ignore_rev="", + ignore_revs_file=None, + until=None, + processes: int = 1, + author_mapping_file_path: str = None, + author_email_mapping_file_path: str = None, ): """Returns dict: {"": {"loc": int, "files": {}, "commits": int, "ctimes": [int]}}""" until = ["--until", until] if until else [] @@ -339,9 +337,8 @@ def _get_auth_stats( git_cmd = ["git", "-C", gitdir] log.debug("base command:%s", git_cmd) - file_list = check_output( - git_cmd + ["ls-files", "--eol", "--with-tree", branch] - ).strip().splitlines() + file_list = check_output(git_cmd + + ["ls-files", "--eol", "--with-tree", branch]).strip().splitlines() binary_file_list = [] text_file_list = [] for f in file_list: @@ -388,18 +385,15 @@ def _get_auth_stats( if C: base_cmd.extend(["-C", "-C"]) # twice to include file creation - auth_stats = defaultdict( - lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()} - ) # {author: {[loc,files,ctimes,exts]: + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()} + ) # {author: {[loc,files,ctimes,exts]: auth2em = defaultdict(set) - author_canonicalizer = _get_user_canonicalization_function( - author_mapping_file_path, author_email_mapping_file_path - ) + author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, + author_email_mapping_file_path) - def stats_append( - fname: str, auth: str, loc: int, tstamp: str, author_email: str, commit_id: str - ): + def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str, + commit_id: str): auth = author_canonicalizer(auth, author_email) tstamp = int(tstamp) @@ -422,16 +416,10 @@ def stats_append( completed = queue.Queue() def process_blame_out(commit_infos: Dict[str, _CommitInfo]): - for commit_id, cinfo in commit_infos.items(): # for each chunk + for commit_id, cinfo in commit_infos.items(): # for each chunk for fname, loc in cinfo.file_locs.items(): - stats_append( - fname, - cinfo.info['author'], - loc, - cinfo.info['committer-time'], - cinfo.info['author-mail'], - commit_id - ) + stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], + cinfo.info['author-mail'], commit_id) completed.put(None) @@ -444,26 +432,19 @@ def process_blame_out_error(fname, err): if prefix_gitdir: fname = path.join(gitdir, fname) - mp_pool.apply_async( - _get_blame_out, - args=(base_cmd, branch, fname, since, until), - callback=process_blame_out, - error_callback=partial(process_blame_out_error, fname) - ) - - for _ in tqdm( - file_list, desc=gitdir if prefix_gitdir else "Processing", - disable=silent_progress, unit="file" - ): + mp_pool.apply_async(_get_blame_out, args=(base_cmd, branch, fname, since, until), + callback=process_blame_out, + error_callback=partial(process_blame_out_error, fname)) + + for _ in tqdm(file_list, desc=gitdir if prefix_gitdir else "Processing", + disable=silent_progress, unit="file"): completed.get() mp_pool.close() mp_pool.join() else: - with tqdm( - total=1, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, - unit="repo" - ) as t: + with tqdm(total=1, desc=gitdir if prefix_gitdir else "Processing", disable=silent_progress, + unit="repo") as t: blame_out = check_output(base_cmd + [branch], stderr=subprocess.STDOUT) t.update() log.log(logging.NOTSET, blame_out) @@ -474,9 +455,8 @@ def process_blame_out_error(fname, err): blame_out = RE_STAT_BINARY.sub('', blame_out) blame_out = RE_AUTHS_LOG.split(blame_out) - blame_out = zip( - blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], blame_out[5::5] - ) + blame_out = zip(blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], + blame_out[5::5]) for auth, auth_email, commit_hash, tstamp, fnames in blame_out: fnames = fnames.split('\naN', 1)[0] for i in fnames.strip().split('\n'): @@ -497,15 +477,13 @@ def process_blame_out_error(fname, err): # quickly count commits (even if no surviving loc) log.log(logging.NOTSET, "authors:%s", list(auth_stats.keys())) - if show_email: # replace author name with email + if show_email: # replace author name with email log.debug(auth2em) old = auth_stats - auth_stats = defaultdict( - lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': 0} - ) + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': 0}) for auth, stats in getattr(old, 'iteritems', old.items)(): - auth_email = list(auth2em[auth])[0] # TODO: count most used email? + auth_email = list(auth2em[auth])[0] # TODO: count most used email? i = auth_stats[auth_email] i["loc"] += stats["loc"] i["files"].update(stats["files"]) From 4ce17263abf4863d2b3acbc24e1d2bf0e65057a3 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 18:59:28 -0800 Subject: [PATCH 12/15] yapf/flake fight resolution --- gitfame/_gitfame.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 0743d74..e013634 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -385,8 +385,7 @@ def _get_auth_stats( if C: base_cmd.extend(["-C", "-C"]) # twice to include file creation - auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()} - ) # {author: {[loc,files,ctimes,exts]: + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()}) auth2em = defaultdict(set) author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, @@ -416,7 +415,7 @@ def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str completed = queue.Queue() def process_blame_out(commit_infos: Dict[str, _CommitInfo]): - for commit_id, cinfo in commit_infos.items(): # for each chunk + for commit_id, cinfo in commit_infos.items(): for fname, loc in cinfo.file_locs.items(): stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], cinfo.info['author-mail'], commit_id) From 6813ba311cc0faab97bccc7c742d40aadb5338b5 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 21:40:37 -0800 Subject: [PATCH 13/15] output correct number of commits --- gitfame/_gitfame.py | 120 +++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 47 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index e013634..7174474 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -75,7 +75,7 @@ from functools import partial from os import path from pathlib import Path -from typing import Dict +from typing import Dict, Optional from ._utils import (TERM_WIDTH, Str, TqdmStream, check_output, fext, int_cast_or_len, merge_stats, print_unicode, tqdm) @@ -105,7 +105,6 @@ r'^\w+\s+\d+\s+\d+(\s+\d+)?\s*$[^\t]*?^boundary\s*$[^\t]*?^\t.*?$\r?\n', flags=re.M | re.DOTALL) # processing `log --format="aN%aN ct%ct" --numstat` -RE_AUTHS_LOG = re.compile(r"^aN(.+?) aE(.+?) H([a-f0-9]+) ct(\d+)\n\n", flags=re.M) RE_STAT_BINARY = re.compile(r"^\s*?-\s*-.*?\n", flags=re.M) RE_RENAME = re.compile(r"\{.+? => (.+?)\}") # finds all non-escaped commas @@ -224,7 +223,7 @@ def tabulate(auth_stats, stats_tot, sort='loc', bytype=False, backend='md', cost class _CommitInfo: def __init__(self): - self.file_locs = defaultdict(int) # {file_name: [loc, ... + self.file_locs = defaultdict(int) # {file_name: loc} self.info = {} @@ -232,25 +231,32 @@ def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, until) -> Dict[str, _CommitInfo]: blame_out = check_output(base_cmd + [branch, fname], stderr=subprocess.STDOUT) - commit_infos = defaultdict(_CommitInfo) # {commit: {file: commit_info - commit = None - loc = None + # Coalesces info by commit + commit_infos = defaultdict(_CommitInfo) + commit_info = loc = None for line in blame_out.splitlines(): if match := _RE_BLAME_START_LINE.match(line): commit = match['commit_hash'] - loc = int(match['lines_of_code']) # needs to be applied to each file of the commit - elif line.startswith('\t'): - continue + loc = int(match['lines_of_code']) + commit_info = commit_infos[commit] elif line == 'boundary': - continue + continue # TODO: is this ok? else: key, value = line.split(' ', 1) + if key in ('previous', 'summary'): + continue if key == 'filename': - commit_infos[commit].file_locs[value] += loc - else: - commit_infos[commit].info[key] = value + commit_info.file_locs[value] += loc + continue + + assert not line.startswith('\t') + if key == 'filename': + commit_info.file_locs[value] += loc + + assert key not in commit_info.info + commit_info.info[key] = value # TODO assert not since and not until @@ -271,6 +277,7 @@ def _get_blame_out(base_cmd: list[str], branch: str, fname: str, since, return dict(commit_infos) +# TODO: probably should be swapped to mailmap def _get_user_canonicalization_function(author_mapping_file_path: str = None, author_email_mapping_file_path: str = None): user_mappings = {} @@ -309,6 +316,12 @@ def detect_bom(path: str, default=None): return default +GIT_BLAME_FORMAT = "ct%ct H%H aE%aE aN%aN" +RE_AUTHS_LOG_COMMIT = re.compile( + r"^ct(?P\d*) H(?P[a-f0-9]*) aE(?P[^ ]*) aN(?P.+?)$") +RE_AUTHS_LOG_FILE = re.compile(r"^(?P\d+)\s+(?P\d+)\s+(?P.+?)$") + + def _get_auth_stats( gitdir: str, branch: str = "HEAD", @@ -317,12 +330,12 @@ def _get_auth_stats( exclude_files=None, silent_progress=False, ignore_whitespace=False, - M=False, - C=False, - warn_binary=False, - bytype=False, - show_email=False, - prefix_gitdir=False, + M: bool = False, + C: bool = False, + warn_binary: bool = False, + bytype: bool = False, + show_email: bool = False, + prefix_gitdir: bool = False, churn=None, ignore_rev="", ignore_revs_file=None, @@ -376,7 +389,7 @@ def _get_auth_stats( if ignore_revs_file: base_cmd.extend(["--ignore-revs-file", ignore_revs_file]) else: - base_cmd = git_cmd + ["log", "--format=aN%aN aE%aE H%H ct%ct", "--numstat"] + since + until + base_cmd = git_cmd + ["log", f"--format={GIT_BLAME_FORMAT}", "--numstat"] + since + until if ignore_whitespace: base_cmd.append("-w") @@ -385,14 +398,13 @@ def _get_auth_stats( if C: base_cmd.extend(["-C", "-C"]) # twice to include file creation - auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': set()}) + auth_stats = defaultdict(lambda: {'loc': 0, 'files': set(), 'ctimes': [], 'commits': 0}) auth2em = defaultdict(set) author_canonicalizer = _get_user_canonicalization_function(author_mapping_file_path, author_email_mapping_file_path) - def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str, - commit_id: str): + def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str): auth = author_canonicalizer(auth, author_email) tstamp = int(tstamp) @@ -402,7 +414,8 @@ def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str i["loc"] += loc i["files"].add(fname) i["ctimes"].append(tstamp) - i['commits'].add(commit_id) + # NOTE: we could add all the commits here, that would equate to how many commits + # the author has that contain code still visible in the working branch if bytype: fext_key = f".{fext(fname) or '_None_ext'}" @@ -415,10 +428,10 @@ def stats_append(fname: str, auth: str, loc: int, tstamp: str, author_email: str completed = queue.Queue() def process_blame_out(commit_infos: Dict[str, _CommitInfo]): - for commit_id, cinfo in commit_infos.items(): + for cinfo in commit_infos.values(): for fname, loc in cinfo.file_locs.items(): stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], - cinfo.info['author-mail'], commit_id) + cinfo.info['author-mail']) completed.put(None) @@ -451,30 +464,43 @@ def process_blame_out_error(fname, err): # Strip binary files for fname in set(RE_STAT_BINARY.findall(blame_out)): getattr(log, "warn" if warn_binary else "debug")("binary:%s", fname.strip()) - blame_out = RE_STAT_BINARY.sub('', blame_out) - - blame_out = RE_AUTHS_LOG.split(blame_out) - blame_out = zip(blame_out[1::5], blame_out[2::5], blame_out[3::5], blame_out[4::5], - blame_out[5::5]) - for auth, auth_email, commit_hash, tstamp, fnames in blame_out: - fnames = fnames.split('\naN', 1)[0] - for i in fnames.strip().split('\n'): - try: - inss, dels, fname = i.split('\t') - except ValueError: - log.warning(i) - else: - fname = RE_RENAME.sub(r'\\2', fname) - loc = (int(inss) if churn & CHURN_INS and inss else - 0) + (int(dels) if churn & CHURN_DEL and dels else 0) - stats_append(fname, auth, int(loc), tstamp, auth_email, commit_hash) - - # translate commit-ids to # of commits - for astat in auth_stats.values(): - astat['commits'] = len(astat['commits']) + lines = RE_STAT_BINARY.sub('', blame_out).splitlines() + + commit_infos = defaultdict(_CommitInfo) + commit_info: Optional[_CommitInfo] = None + for line_num, line in enumerate(lines): + if not line: + continue + + if m := RE_AUTHS_LOG_COMMIT.match(line): + commit = m['commit'] + commit_info = commit_infos[commit] + commit_info.info.update({ + 'author': m['author'], + 'author-mail': m['auth_email'], + 'committer-time': m['timestamp'],}) + elif m := RE_AUTHS_LOG_FILE.match(line): + fname = RE_RENAME.sub(r'\\2', m['fname']) + inss, dels = m['inserts'], m['deletes'] + loc = (int(inss) if churn & CHURN_INS and inss else + 0) + (int(dels) if churn & CHURN_DEL and dels else 0) + + commit_info.file_locs[fname] += loc + else: + assert False, f'error parsing blame line ({line_num}): {line}' + + for cinfo in commit_infos.values(): + for fname, loc in cinfo.file_locs.items(): + stats_append(fname, cinfo.info['author'], loc, cinfo.info['committer-time'], + cinfo.info['author-mail']) # quickly count commits (even if no surviving loc) log.log(logging.NOTSET, "authors:%s", list(auth_stats.keys())) + auth_commits = check_output(git_cmd + ["shortlog", "-s", "-e", branch] + since + until) + for (ncom, auth, em) in RE_NCOM_AUTH_EM.findall(auth_commits.strip()): + auth = author_canonicalizer(auth, em) + auth_stats[auth]['commits'] += int(ncom) + auth2em[auth].add(em) if show_email: # replace author name with email log.debug(auth2em) From b89f6bb3245bdad29aea5b73dce82e9264d3f888 Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 22:20:41 -0800 Subject: [PATCH 14/15] fix eol detection --- gitfame/_gitfame.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 7174474..39b43af 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -298,10 +298,6 @@ def canonicalize(author: str, author_email: str): return canonicalize -_RE_EOL_LINE = re.compile( - r'^(?P[^ \t]+)+\s+(?P[^ \t]+)\s+(?P[^ \t]+)\s+(?P.*)$') - - def detect_bom(path: str, default=None): with open(path, 'rb') as f: raw = f.read(4) # will read less if the file is smaller @@ -351,12 +347,11 @@ def _get_auth_stats( log.debug("base command:%s", git_cmd) file_list = check_output(git_cmd + - ["ls-files", "--eol", "--with-tree", branch]).strip().splitlines() + ["ls-files", "--format=%(eolinfo:index)|%(eolinfo:worktree)|%(eolattr)|%(path)", "--with-tree", branch]).strip().splitlines() binary_file_list = [] text_file_list = [] for f in file_list: - m = _RE_EOL_LINE.match(f) - fpath = m['fpath'] + _, eol_worktree, _, fpath = f.split('|', 3) if not hasattr(include_files, 'search'): if (include_files and fpath not in include_files) or fpath in exclude_files: @@ -364,7 +359,7 @@ def _get_auth_stats( elif (not include_files.search(fpath)) or (exclude_files and exclude_files.search(fpath)): continue - if m['eol_worktree'] == 'w/-text': + if eol_worktree == '-text': binary_file_list.append(fpath) else: text_file_list.append(fpath) From 0f5979e8e1d2d8b399217d1b5bca6a9f4984088b Mon Sep 17 00:00:00 2001 From: Alexander Mohr Date: Thu, 21 Nov 2024 22:30:12 -0800 Subject: [PATCH 15/15] flake / yapf --- gitfame/_gitfame.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gitfame/_gitfame.py b/gitfame/_gitfame.py index 39b43af..a90b6bc 100755 --- a/gitfame/_gitfame.py +++ b/gitfame/_gitfame.py @@ -346,8 +346,10 @@ def _get_auth_stats( git_cmd = ["git", "-C", gitdir] log.debug("base command:%s", git_cmd) - file_list = check_output(git_cmd + - ["ls-files", "--format=%(eolinfo:index)|%(eolinfo:worktree)|%(eolattr)|%(path)", "--with-tree", branch]).strip().splitlines() + file_list = check_output(git_cmd + [ + "ls-files", "--format=%(eolinfo:index)|%(eolinfo:worktree)|%(eolattr)|%(path)", + "--with-tree", branch]).strip().splitlines() + binary_file_list = [] text_file_list = [] for f in file_list: @@ -471,9 +473,8 @@ def process_blame_out_error(fname, err): commit = m['commit'] commit_info = commit_infos[commit] commit_info.info.update({ - 'author': m['author'], - 'author-mail': m['auth_email'], - 'committer-time': m['timestamp'],}) + 'author': m['author'], 'author-mail': m['auth_email'], + 'committer-time': m['timestamp']}) elif m := RE_AUTHS_LOG_FILE.match(line): fname = RE_RENAME.sub(r'\\2', m['fname']) inss, dels = m['inserts'], m['deletes'] @@ -482,7 +483,7 @@ def process_blame_out_error(fname, err): commit_info.file_locs[fname] += loc else: - assert False, f'error parsing blame line ({line_num}): {line}' + raise AssertionError(f'error parsing blame line ({line_num}): {line}') for cinfo in commit_infos.values(): for fname, loc in cinfo.file_locs.items():