Skip to content

Commit 8f81d69

Browse files
Merge pull request #3 from GitHubSecurityLab/codeql-python
Add CodeQL for Python MCP and taskflow
2 parents 0c7aac2 + 44941a5 commit 8f81d69

File tree

12 files changed

+553
-22
lines changed

12 files changed

+553
-22
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
Queries in support of the CodeQL MCP Server are maintained as query packs.
2+
3+
If you add your own queries, please follow established conventions for normal CodeQL query pack development.
4+
5+
To run the CodeQL for Python server:
6+
- create a codespace, preferably on a machine type with more cores
7+
- install the CodeQL extension for VS Code
8+
- press `Ctrl/Cmd + Shift + P` and type "CodeQL: Install Pack Dependencies". Choose "sylwia-budzynska/mcp-python" and press "OK".
9+
- find the path to the codeql binary, which comes preinstalled with the VS Code CodeQL extension, with the command:
10+
```bash
11+
find ~ -type f -name codeql -executable 2>/dev/null
12+
```
13+
It will most likely look similar to this:
14+
```
15+
/home/codespace/.vscode-remote/data/User/globalStorage/github.vscode-codeql/distribution1/codeql/codeql
16+
```
17+
- create a folder named 'data'
18+
- create or update your `.env` file in the root of this project with values for:
19+
```
20+
COPILOT_TOKEN= # a fine-grained GitHub personal access token with permission for "copilot chat"
21+
CODEQL_DBS_BASE_PATH="/workspaces/seclab-taskflows/data/codeql_databases" # path to folder with your CodeQL databases
22+
23+
# Example values for a local setup, run with `python -m seclab_taskflow_agent -t seclab_taskflows.taskflows.audit.remote_sources_local`
24+
MEMCACHE_STATE_DIR="/workspaces/seclab-taskflows/data" # path to folder for storing the memcache database
25+
DATA_DIR="/workspaces/seclab-taskflows/data" # path to folder for storing the codeql_sqlite databases and all other data
26+
GITHUB_PERSONAL_ACCESS_TOKEN= # can be the same token as COPILOT_TOKEN. Or another one, with access e.g. to private repositories
27+
CODEQL_CLI= # output of command `find ~ -type f -name codeql -executable 2>/dev/null`
28+
29+
# Example docker env run with ./run_seclab_agent.sh [...]
30+
# CODEQL_CLI="codeql"
31+
# CODEQL_DBS_BASE_PATH="/app/data/codeql_databases"
32+
# MEMCACHE_STATE_DIR="/app/data"
33+
# DATA_DIR="/app/data"
34+
```
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# SPDX-FileCopyrightText: 2025 GitHub
2+
# SPDX-License-Identifier: MIT
3+
4+
from sqlalchemy import Text
5+
from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped
6+
from typing import Optional
7+
8+
class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this module derive from."""
10+
11+
12+
class Source(Base):
    """ORM model for one row of the ``source`` table.

    A row records a source found in a repository: the file it lives in
    (``source_location``), its line, its type and optional free-form notes.
    """

    __tablename__ = 'source'

    id: Mapped[int] = mapped_column(primary_key=True)
    repo: Mapped[str]
    source_location: Mapped[str]
    line: Mapped[int]
    source_type: Mapped[str]
    # Notes are optional and stored in a Text column.
    notes: Mapped[Optional[str]] = mapped_column(Text, nullable=True)

    def __repr__(self):
        """Return a debugging representation listing every column value."""
        parts = (
            f"id={self.id}",
            f"repo={self.repo}",
            f"location={self.source_location}",
            f"line={self.line}",
            f"source_type={self.source_type}",
            f"notes={self.notes}",
        )
        return "<Source(" + ", ".join(parts) + ")>"
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
# SPDX-FileCopyrightText: 2025 GitHub
2+
# SPDX-License-Identifier: MIT
3+
4+
5+
import logging
6+
# The log file lives in a "logs" folder relative to the working directory.
# The file handler does not create missing parent directories, so make sure
# the folder exists before configuring logging; otherwise importing this
# module fails with FileNotFoundError.
import os

os.makedirs('logs', exist_ok=True)
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='logs/mcp_codeql_python.log',
    filemode='a',
)
12+
from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, _debug_log
13+
# from seclab_taskflow_agent.path_utils import mcp_data_dir
14+
15+
from pydantic import Field
16+
#from mcp.server.fastmcp import FastMCP, Context
17+
from fastmcp import FastMCP # use FastMCP 2.0
18+
from pathlib import Path
19+
import os
20+
import csv
21+
import json
22+
from sqlalchemy import create_engine
23+
from sqlalchemy.orm import Session
24+
import subprocess
25+
import importlib.resources
26+
27+
from .codeql_sqlite_models import Base, Source
28+
from ..utils import process_repo
29+
30+
# Storage locations are read from the environment; both fall back to
# /app/data when the corresponding variable is not set.
MEMORY = Path(os.environ.get('DATA_DIR', '/app/data'))
CODEQL_DBS_BASE_PATH = Path(os.environ.get('CODEQL_DBS_BASE_PATH', '/app/data'))
32+
# MEMORY = mcp_data_dir('seclab-taskflows', 'codeql', 'DATA_DIR')
33+
# CODEQL_DBS_BASE_PATH = mcp_data_dir('seclab-taskflows', 'codeql', 'CODEQL_DBS_BASE_PATH')
34+
35+
# FastMCP server instance; the functions decorated with @mcp.tool() in this module register on it.
mcp = FastMCP("CodeQL-Python")
36+
37+
# tool name -> templated query lookup for supported languages
38+
# Query files available for Python, keyed by tool name.
_PYTHON_TEMPLATED_QUERIES = {
    'remote_sources': 'queries/mcp-python/remote_sources.ql',
}

# language -> {tool name -> query file path}
# to add a language, port the templated query pack and add its definition here
TEMPLATED_QUERY_PATHS = {
    'python': _PYTHON_TEMPLATED_QUERIES,
}
44+
45+
46+
def source_to_dict(result):
    """Convert a source row into a plain dict.

    The row's ``id`` attribute is exposed under the ``source_id`` key; every
    other attribute keeps its own name.
    """
    # (output key, attribute on the row) in the order the keys are emitted.
    key_to_attribute = (
        ("source_id", "id"),
        ("repo", "repo"),
        ("source_location", "source_location"),
        ("line", "line"),
        ("source_type", "source_type"),
        ("notes", "notes"),
    )
    return {key: getattr(result, attribute) for key, attribute in key_to_attribute}
55+
56+
def _resolve_query_path(language: str, query: str) -> Path:
    """Look up the query file registered for ``query`` under ``language``.

    Returns the registered path as a ``Path``. Raises ``RuntimeError`` when
    the language is not in ``TEMPLATED_QUERY_PATHS`` or when no path is
    registered for the query in that language.
    """
    if language in TEMPLATED_QUERY_PATHS:
        registered = TEMPLATED_QUERY_PATHS[language].get(query)
        if registered:
            return Path(registered)
        raise RuntimeError(f"Error: query `{query}` not supported for `{language}`!")
    raise RuntimeError(f"Error: Language `{language}` not supported!")
64+
65+
66+
def _resolve_db_path(relative_db_path: str | Path):
    """Resolve a database path relative to ``CODEQL_DBS_BASE_PATH``.

    Returns the absolute path as a string. Raises ``RuntimeError`` when the
    resolved path falls outside the base path or is not an existing directory.
    """
    # Joining "/A" with an absolute "/B" (or "////B") yields "/B", so leading
    # slashes are dropped to keep the result under the base path.
    # not windows compatible and probably needs additional hardening
    cleaned = str(relative_db_path).strip().lstrip('/')
    base_dir = CODEQL_DBS_BASE_PATH
    candidate = (base_dir / Path(cleaned)).resolve()

    # ".." segments or symlinks can still escape the base after resolving.
    if not candidate.is_relative_to(base_dir.resolve()):
        raise RuntimeError(f"Error: Database path {candidate} is outside the base path {base_dir}")

    if candidate.is_dir():
        return str(candidate)

    _debug_log(f"Database path not found: {candidate}")
    raise RuntimeError(f"Error: Database not found at {candidate}!")
79+
80+
# This sqlite database is specifically made for CodeQL for Python MCP.
81+
class CodeqlSqliteBackend:
    """SQLite-backed store for the sources recorded by the CodeQL tools.

    The database file is ``codeql_sqlite.db`` inside ``memcache_state_dir``.
    When that directory does not exist, the ``sqlite://`` URL is used instead,
    which SQLAlchemy treats as an in-memory database.
    """

    def __init__(self, memcache_state_dir: str):
        """Create the engine and make sure the ``source`` table exists."""
        self.memcache_state_dir = memcache_state_dir
        if not Path(self.memcache_state_dir).exists():
            db_dir = 'sqlite://'
        else:
            db_dir = f'sqlite:///{self.memcache_state_dir}/codeql_sqlite.db'
        self.engine = create_engine(db_dir, echo=False)
        Base.metadata.create_all(self.engine, tables=[Source.__table__])

    def store_new_source(self, repo, source_location, line, source_type, notes, update=False):
        """Insert a source, or append notes to the one already stored.

        A source is identified by ``(repo, source_location, line)``.

        Args:
            repo: Repository identifier the source belongs to.
            source_location: File path of the source.
            line: Line number of the source.
            source_type: Type of the source; only used when inserting.
            notes: Text to store or append. May be ``None``, in which case an
                existing row's notes are left untouched.
            update: When true, never insert; only append to an existing row.

        Returns:
            A human-readable status message.
        """
        with Session(self.engine) as session:
            existing = session.query(Source).filter_by(
                repo=repo, source_location=source_location, line=line
            ).first()
            if existing:
                # Callers may pass notes=None (remote_sources does); appending
                # None to a string would raise TypeError, so skip the append.
                if notes is not None:
                    existing.notes = (existing.notes or "") + notes
                    session.commit()
                return f"Updated notes for source at {source_location}, line {line} in {repo}."

            if update:
                return f"No source exists at repo {repo}, location {source_location}, line {line} to update."

            new_source = Source(
                repo=repo,
                source_location=source_location,
                line=line,
                source_type=source_type,
                notes=notes,
            )
            session.add(new_source)
            session.commit()
            return f"Added new source for {source_location} in {repo}."

    def get_sources(self, repo):
        """Return every stored source for ``repo`` as a list of plain dicts."""
        with Session(self.engine) as session:
            results = session.query(Source).filter_by(repo=repo).all()
            return [source_to_dict(source) for source in results]
112+
113+
114+
# our query result format is: "human readable template {val0} {val1},'key0,key1',val0,val1"
115+
def _csv_parse(raw):
116+
results = []
117+
reader = csv.reader(raw.strip().splitlines())
118+
try:
119+
for i, row in enumerate(reader):
120+
if i == 0:
121+
continue
122+
# col1 has what we care about, but offer flexibility
123+
keys = row[1].split(',')
124+
this_obj = {'description': row[0].format(*row[2:])}
125+
for j, k in enumerate(keys):
126+
this_obj[k.strip()] = row[j + 2]
127+
results.append(this_obj)
128+
except (csv.Error, IndexError, ValueError) as e:
129+
return f"Error: CSV parsing error: {e}"
130+
return results
131+
132+
133+
def _run_query(query_name: str, database_path: str, language: str, template_values: dict):
    """Run a templated CodeQL query against a database and parse its CSV output.

    Returns the parsed rows from ``_csv_parse``. Every failure (unresolvable
    database path, unsupported query or language, error while running the
    query) is reported as a message string rather than raised.
    """
    try:
        resolved_db = _resolve_db_path(database_path)
    except RuntimeError:
        return f"The database path for {database_path} could not be resolved"

    try:
        relative_query = _resolve_query_path(language, query_name)
    except RuntimeError:
        return f"The query {query_name} is not supported for language: {language}"

    try:
        # Query paths are relative to the directory containing this module.
        query_file = Path(__file__).parent.resolve() / relative_query
        raw_output = run_query(
            query_file,
            resolved_db,
            fmt='csv',
            template_values=template_values,
            log_stderr=True,
        )
        return _csv_parse(raw_output)
    except Exception as e:
        return f"The query {query_name} encountered an error: {e}"
154+
155+
# Module-level store used by the tools in this file; backed by the directory in MEMORY (DATA_DIR).
backend = CodeqlSqliteBackend(MEMORY)
156+
157+
@mcp.tool()
def remote_sources(owner: str = Field(description="The owner of the GitHub repository"),
                   repo: str = Field(description="The name of the GitHub repository"),
                   database_path: str = Field(description="The CodeQL database path."),
                   language: str = Field(description="The language used for the CodeQL database.")):
    """List all remote sources and their locations in a CodeQL database, then store the results in a database."""
    repo = process_repo(owner, repo)
    results = _run_query('remote_sources', database_path, language, {})

    # _run_query reports a failure as a single string; a successful run
    # yields a list of dicts, one per source.
    if isinstance(results, str):
        return f"Error: {results}"

    # Persist every row. Notes are left empty here (the row's 'description'
    # is deliberately not stored) so they can be added later.
    for row in results:
        backend.store_new_source(
            repo=repo,
            source_location=row.get('location', ''),
            source_type=row.get('source', ''),
            line=int(row.get('line', '0')),
            notes=None,
            update=False,
        )

    stored_count = len(results)
    return f"Stored {stored_count} remote sources in {repo}."
185+
186+
@mcp.tool()
def fetch_sources(owner: str = Field(description="The owner of the GitHub repository"),
                  repo: str = Field(description="The name of the GitHub repository")):
    """
    Fetch all sources from the repo
    """
    repo = process_repo(owner, repo)
    # The stored rows are returned to the caller as a JSON array.
    stored_sources = backend.get_sources(repo)
    return json.dumps(stored_sources)
194+
195+
@mcp.tool()
def add_source_notes(owner: str = Field(description="The owner of the GitHub repository"),
                     repo: str = Field(description="The name of the GitHub repository"),
                     source_location: str = Field(description="The path to the file"),
                     line: int = Field(description="The line number of the source"),
                     notes: str = Field(description="The notes to append to this source")):
    """
    Add new notes to an existing source. The notes will be appended to any existing notes.
    """
    repo = process_repo(owner, repo)
    # update=True means a missing source is reported rather than created, so
    # the empty source_type is never written.
    status = backend.store_new_source(
        repo=repo,
        source_location=source_location,
        line=line,
        source_type="",
        notes=notes,
        update=True,
    )
    return status
206+
207+
@mcp.tool()
def clear_codeql_repo(owner: str = Field(description="The owner of the GitHub repository"),
                      repo: str = Field(description="The name of the GitHub repository")):
    """
    Clear all data for a given repo from the database
    """
    repo = process_repo(owner, repo)
    with Session(backend.engine) as session:
        # delete() returns the number of rows removed.
        matching = session.query(Source).filter_by(repo=repo)
        deleted_sources = matching.delete()
        session.commit()
    return f"Cleared {deleted_sources} sources from repo {repo}."
218+
219+
if __name__ == "__main__":
    # Install the bundled query pack's dependencies unless codeql/python-all
    # is already present.
    # NOTE(review): the check looks under the filesystem root ("/.codeql"),
    # not the user's home directory - confirm this matches the runtime image.
    python_all_installed = os.path.isdir('/.codeql/packages/codeql/python-all')
    if not python_all_installed:
        queries_package = importlib.resources.files('seclab_taskflows.mcp_servers.codeql_python.queries')
        pack_path = queries_package.joinpath('mcp-python')
        print(f"Installing CodeQL pack from {pack_path}")
        # NOTE(review): the exit status is not checked, so a failed install
        # does not stop the server from starting - confirm this is intended.
        subprocess.run(["codeql", "pack", "install", pack_path])

    mcp.run(show_banner=False, transport="http", host="127.0.0.1", port=9998)
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
---
2+
lockVersion: 1.0.0
3+
dependencies:
4+
codeql/concepts:
5+
version: 0.0.8
6+
codeql/controlflow:
7+
version: 2.0.18
8+
codeql/dataflow:
9+
version: 2.0.18
10+
codeql/mad:
11+
version: 1.0.34
12+
codeql/python-all:
13+
version: 4.1.0
14+
codeql/regex:
15+
version: 1.0.34
16+
codeql/ssa:
17+
version: 2.0.10
18+
codeql/threat-models:
19+
version: 1.0.34
20+
codeql/tutorial:
21+
version: 1.0.34
22+
codeql/typetracking:
23+
version: 2.0.18
24+
codeql/util:
25+
version: 2.0.21
26+
codeql/xml:
27+
version: 1.0.34
28+
codeql/yaml:
29+
version: 1.0.34
30+
compiled: false
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
library: false
3+
warnOnImplicitThis: false
4+
name: sylwia-budzynska/mcp-python
5+
version: 0.0.1
6+
dependencies:
7+
codeql/python-all: ^4.1.0
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/**
 * This is an automatically generated file
 * @name Hello world
 * @kind problem
 * @problem.severity warning
 * @id python/example/hello-world
 */

import python

// Example query: reports every file with a fixed message.
from File sourceFile
select sourceFile, "Hello, world!"
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/**
 * @id mcp-python/remote-sources
 * @name Python Remote Sources
 * @description Identifies nodes that act as remote sources in Python code, along with their locations.
 * @tags source, location
 */

import python
import semmle.python.dataflow.new.RemoteFlowSources

// string normalizeLocation(Location l) {
//     result = l.getFile().getRelativePath() + ":" + l.getStartLine().toString() + ":" + l.getStartColumn().toString()
//         + ":" + l.getEndLine().toString() + ":" + l.getEndColumn().toString()
// }

// Column layout: a message template, the comma-separated keys naming the
// remaining columns, then one value per key in the same order.
from RemoteFlowSource source, Location loc
where loc = source.getLocation()
select
  "Remote source {0} is defined at {1} line {2}",
  "source,location,line",
  source.getSourceType(),
  loc.getFile().getRelativePath(),
  loc.getStartLine().toString()

0 commit comments

Comments
 (0)