Skip to content

Commit 7e3a545

Browse files
authored
Use GraphQL API for bot detection (#146)
* Use GraphQL API for bot detection
* Add some docs about bot behavior
* Fix CI: Use GITHUB_TOKEN fallback for PR tests. Use GITHUB_TOKEN when the TOKEN_READONLY secret isn't available (PRs from forks). This allows tests to run on all PRs while still using the higher-limit PAT when available on pushed branches.
* Use DataFrame.attrs instead of direct attribute assignment. Fixes the pandas UserWarning about creating columns via attribute names. Uses the proper pandas API for storing metadata on DataFrames.
1 parent 89a921b commit 7e3a545

File tree

4 files changed

+58
-35
lines changed

4 files changed

+58
-35
lines changed

.github/workflows/tests.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,9 @@ jobs:
3535
tests:
3636
runs-on: ubuntu-24.04
3737
env:
38-
# This is a PAT for @choldgraf that only has read-access to this repo.
39-
# We use it to avoid query limits.
40-
GITHUB_ACCESS_TOKEN: "${{ secrets.TOKEN_READONLY }}"
38+
# Use TOKEN_READONLY if available (pushed branches), otherwise use GITHUB_TOKEN (PRs)
39+
# TOKEN_READONLY is a PAT for @choldgraf that has only read access to this repo; it is not exposed to PRs from forks.
40+
GITHUB_ACCESS_TOKEN: "${{ secrets.TOKEN_READONLY || secrets.GITHUB_TOKEN }}"
4141
strategy:
4242
matrix:
4343
include:

docs/use.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,17 +140,17 @@ To include Issues and Pull Requests that were _opened_ in a time period, use the
140140

141141
## Remove bots from the changelog
142142

143-
`github-activity` ships with a known list of bot usernames, but your project may use ones not on our list.
144-
To ignore additional usernames from the changelog, use the `--ignore-contributor` flag:
143+
`github-activity` automatically detects and excludes bot accounts using GitHub's API.
144+
Bot accounts (such as `dependabot` and `github-actions`) are identified by the account type that GitHub's API reports for them.
145+
146+
To ignore additional human contributors from the changelog, use the `--ignore-contributor` flag:
145147

146148
```
147-
github-activity ... --ignore-contributor robot-one --ignore-contributor 'robot-two*'
149+
github-activity ... --ignore-contributor user-one --ignore-contributor 'test-user-*'
148150
```
149151

150152
Wildcards are matched as per [filename matching semantics](https://docs.python.org/3/library/fnmatch.html#fnmatch.fnmatch).
151153

152-
If this is a generic bot username, consider contributing it back to [our list](https://github.com/executablebooks/github-activity/blob/main/github_activity/github_activity.py#L73).
153-
154154
## Use a GitHub API token
155155

156156
`github-activity` uses the GitHub API to pull information about a repository's activity.

github_activity/github_activity.py

Lines changed: 9 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -96,29 +96,6 @@
9696
]
9797
)
9898

99-
# exclude known bots from contributor lists
100-
# Also see 'ignore-contributor' flag/configuration option.
101-
BOT_USERS = {
102-
"changeset-bot*",
103-
"codecov*",
104-
"codecov-io*",
105-
"dependabot*",
106-
"github-actions*",
107-
"henchbot*",
108-
"jupyterlab-dev-mode*",
109-
"lgtm-com*",
110-
"meeseeksmachine*",
111-
"names*",
112-
"now*",
113-
"pre-commit-ci*",
114-
"renovate*",
115-
"review-notebook-app*",
116-
"support*",
117-
"stale*",
118-
"todo*",
119-
"welcome*",
120-
}
121-
12299

123100
def get_activity(
124101
target, since, until=None, repo=None, kind=None, auth=None, cache=None
@@ -484,10 +461,15 @@ def generate_activity_md(
484461
# add column for participants in each issue (not just original author)
485462
data["contributors"] = [[]] * len(data)
486463

464+
# Get bot users from GraphQL data (stored in DataFrame attrs)
465+
bot_users = data.attrs.get("bot_users", set())
466+
487467
def ignored_user(username):
488-
return any(fnmatch.fnmatch(username, bot) for bot in BOT_USERS) or any(
489-
fnmatch.fnmatch(username, user) for user in ignored_contributors
490-
)
468+
if username in bot_users:
469+
return True
470+
if any(fnmatch.fnmatch(username, user) for user in ignored_contributors):
471+
return True
472+
return False
491473

492474
def filter_ignored(userlist):
493475
return {user for user in userlist if not ignored_user(user)}
@@ -525,7 +507,7 @@ def filter_ignored(userlist):
525507

526508
comment_author = comment_author["login"]
527509
if ignored_user(comment_author):
528-
# ignore bots
510+
# ignore bots and user-specified contributors
529511
continue
530512

531513
# Add to list of commenters on items they didn't author

github_activity/graphql.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
url
1919
author {
2020
login
21+
__typename
2122
}
2223
}
2324
}
@@ -31,6 +32,7 @@
3132
authorAssociation
3233
author {
3334
login
35+
__typename
3436
}
3537
}
3638
}
@@ -81,6 +83,7 @@
8183
authorAssociation
8284
author {
8385
login
86+
__typename
8487
}
8588
reactions(content: THUMBS_UP) {
8689
totalCount
@@ -100,6 +103,7 @@
100103
{base_elements}
101104
mergedBy {{
102105
login
106+
__typename
103107
}}
104108
mergeCommit {{
105109
oid
@@ -233,8 +237,45 @@ def request(self, n_pages=100, n_per_page=50):
233237
if not pageInfo["hasNextPage"]:
234238
break
235239

240+
# Extract bot users from raw data before DataFrame conversion
241+
def is_bot(user_dict):
242+
"""Check if a GraphQL user object represents a bot account."""
243+
if not user_dict:
244+
return False
245+
return user_dict.get("__typename") == "Bot"
246+
247+
bot_users = set()
248+
for item in self.issues_and_or_prs:
249+
# Check author
250+
author = item.get("author")
251+
if is_bot(author):
252+
bot_users.add(author["login"])
253+
254+
# Check mergedBy
255+
merged_by = item.get("mergedBy")
256+
if is_bot(merged_by):
257+
bot_users.add(merged_by["login"])
258+
259+
# Check reviewers
260+
reviews = item.get("reviews")
261+
if reviews:
262+
for review in reviews.get("edges", []):
263+
review_author = review["node"].get("author")
264+
if is_bot(review_author):
265+
bot_users.add(review_author["login"])
266+
267+
# Check commenters
268+
comments = item.get("comments")
269+
if comments:
270+
for comment in comments.get("edges", []):
271+
comment_author = comment["node"].get("author")
272+
if is_bot(comment_author):
273+
bot_users.add(comment_author["login"])
274+
236275
# Create a dataframe of the issues and/or PRs
237276
self.data = pd.DataFrame(self.issues_and_or_prs)
277+
# Store bot users in DataFrame metadata (attrs dict)
278+
self.data.attrs["bot_users"] = bot_users
238279

239280
# Add some extra fields
240281
def get_login(user):

0 commit comments

Comments
 (0)