From d6ea16158ab14bfdbf90961a33df467b96697150 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:42:49 +0000 Subject: [PATCH 01/26] Ignore .env files in any directory --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3d413d4..bd801df 100644 --- a/.gitignore +++ b/.gitignore @@ -134,7 +134,7 @@ celerybeat.pid *.sage.py # Environments -.env +*.env .venv env/ venv/ From bad9ab43438669d12e7b78258db285b0ed7b922d Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:47:17 +0000 Subject: [PATCH 02/26] Add CodeQL-Python MCP tool --- .../mcp_servers/codeql_python/README.md | 34 +++ .../codeql_python/codeql_sqlite_models.py | 25 ++ .../mcp_servers/codeql_python/mcp_server.py | 260 ++++++++++++++++++ .../queries/mcp-python/codeql-pack.lock.yml | 30 ++ .../queries/mcp-python/codeql-pack.yml | 7 + .../queries/mcp-python/example.ql | 12 + .../queries/mcp-python/remote_sources.ql | 17 ++ .../toolboxes/codeql_python.yaml | 79 ++++++ 8 files changed, 464 insertions(+) create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/README.md create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.lock.yml create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.yml create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/example.ql create mode 100644 src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql create mode 100644 src/seclab_taskflows/toolboxes/codeql_python.yaml diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/README.md b/src/seclab_taskflows/mcp_servers/codeql_python/README.md new file mode 100644 index 0000000..2184ce0 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/README.md @@ -0,0 +1,34 @@ +Queries in support of the CodeQL MCP Server are maintained as query packs. + +If you add your own queries, please follow established conventions for normal CodeQL query pack development. + +To run the CodeQL for Python server: +- create a codespace, preferably with more cores +- install CodeQL extension for VS Code +- press `Ctrl/Cmd + Shift + P` and type "CodeQL: Install Pack Dependencies". Choose "sylwia-budzynska/mcp-python" and press "OK". +- find the path to the codeql binary, which comes preinstalled with the VS Code CodeQL extension, with the command: +```bash +find ~ -type f -name codeql -executable 2>/dev/null +``` +It will most likely look similar to this: +``` +/home/codespace/.vscode-remote/data/User/globalStorage/github.vscode-codeql/distribution1/codeql/codeql +``` +- create a folder named 'data' +- create or update your `.env` file in the root of this project with values for: +``` +COPILOT_TOKEN= # a fine-grained GitHub personal access token with permssion for "copilot chat" +CODEQL_DBS_BASE_PATH="/workspaces/seclab-taskflows/data/codeql_databases" #path to folder with your CodeQL databases + +# Example values for a local setup, run with `python -m seclab_taskflow_agent -t seclab_taskflows.taskflows.audit.remote_sources_local` +MEMCACHE_STATE_DIR="/workspaces/seclab-taskflows/data" # path to folder for storing the memcache database +DATA_DIR="/workspaces/seclab-taskflows/data" # path to folder for storing the codeql_sqlite databases and all other data +GITHUB_PERSONAL_ACCESS_TOKEN= # can be the same token as COPILOT_TOKEN. Or another one, with access e.g. to private repositories +CODEQL_CLI= # output of command `find ~ -type f -name codeql -executable 2>/dev/null` + +# Example docker env run with ./run_seclab_agent.sh [...] +# CODEQL_CLI="codeql" +# CODEQL_DBS_BASE_PATH="/app/data/codeql_databases" +# MEMCACHE_STATE_DIR="/app/data" +# DATA_DIR="/app/data" +``` diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py new file mode 100644 index 0000000..5d8b0e7 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + +from sqlalchemy import String, Text, Integer, ForeignKey, Column +from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped, relationship +from typing import Optional + +class Base(DeclarativeBase): + pass + + +class Source(Base): + __tablename__ = 'source' + + id: Mapped[int] = mapped_column(primary_key=True) + repo: Mapped[str] + source_location: Mapped[str] + type: Mapped[str] + notes: Mapped[Optional[str]] = mapped_column(Text, nullable=True) + + def __repr__(self): + return (f"") diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py new file mode 100644 index 0000000..3fd1b95 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -0,0 +1,260 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + + +import logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(levelname)s - %(message)s', + filename='logs/mcp_codeql_python.log', + filemode='a' +) +from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, file_from_uri, list_src_files, _debug_log, search_in_src_archive + +from pydantic import Field +#from mcp.server.fastmcp import FastMCP, Context +from fastmcp import FastMCP, Context # use FastMCP 2.0 +from pathlib import Path +import os +import csv +import json +from sqlalchemy import create_engine +from sqlalchemy.orm import Session +from pathlib import Path +import zipfile +import httpx +import aiofiles +from .codeql_sqlite_models import Base, Source + +MEMORY = Path(os.getenv('CODEQL_SQLITE_DIR', default='/app/my_data')) +mcp = FastMCP("CodeQL-Python") + +CODEQL_DBS_BASE_PATH = Path(os.getenv('CODEQL_DBS_BASE_PATH', default='/workspaces/seclab-taskflow-agent/my_data')) + +# tool name -> templated query lookup for supported languages +TEMPLATED_QUERY_PATHS = { + # to add a language, port the templated query pack and add its definition here + 'python': { + 'remote_sources': 'queries/mcp-python/remote_sources.ql' + } +} + + +def source_to_dict(result): + return { + "source_id": result.id, + "repo": result.repo, + "source_location": result.source_location, + "type": result.type, + "notes": result.notes + } + +def _resolve_query_path(language: str, query: str) -> Path: + global TEMPLATED_QUERY_PATHS + if language not in TEMPLATED_QUERY_PATHS: + raise RuntimeError(f"Error: Language `{language}` not supported!") + query_path = TEMPLATED_QUERY_PATHS[language].get(query) + if not query_path: + raise RuntimeError(f"Error: query `{query}` not supported for `{language}`!") + return Path(query_path) + + +def _resolve_db_path(relative_db_path: str | Path): + global CODEQL_DBS_BASE_PATH + # path joins will return "/B" if "/A" / "////B" etc. as well + # not windows compatible and probably needs additional hardening + relative_db_path = str(relative_db_path).strip().lstrip('/') + relative_db_path = Path(relative_db_path) + absolute_path = CODEQL_DBS_BASE_PATH / relative_db_path + if not absolute_path.is_dir(): + _debug_log(f"Database path not found: {absolute_path}") + raise RuntimeError(f"Error: Database not found at {absolute_path}!") + return str(absolute_path) + +# This sqlite database is specifically made for CodeQL for Python MCP. +class CodeqlSqliteBackend: + def __init__(self, memcache_state_dir: str): + self.memcache_state_dir = memcache_state_dir + self.location_pattern = r'^([a-zA-Z]+)(:\d+){4}$' + if not Path(self.memcache_state_dir).exists(): + db_dir = 'sqlite://' + else: + db_dir = f'sqlite:///{self.memcache_state_dir}/codeql_sqlite.db' + self.engine = create_engine(db_dir, echo=False) + Base.metadata.create_all(self.engine, tables=[Source.__table__]) + + + def store_new_source(self, repo, source_location, type, notes, update = False): + with Session(self.engine) as session: + existing = session.query(Source).filter_by(repo = repo, source_location = source_location).first() + if existing: + existing.notes += notes + session.commit() + return f"Updated notes for source at {source_location} in {repo}." + else: + if update: + return f"No source exists at repo {repo}, location {source_location}" + new_source = Source(repo = repo, source_location = source_location, type = type, notes = notes) + session.add(new_source) + session.commit() + return f"Added new source for {source_location} in {repo}." + + def get_sources(self, repo): + with Session(self.engine) as session: + results = session.query(Source).filter_by(repo=repo).all() + sources = [source_to_dict(source) for source in results] + return sources + + +# our query result format is: "human readable template {val0} {val1},'key0,key1',val0,val1" +def _csv_parse(raw): + results = [] + reader = csv.reader(raw.strip().splitlines()) + try: + for i, row in enumerate(reader): + if i == 0: + continue + # col1 has what we care about, but offer flexibility + keys = row[1].split(',') + this_obj = {'description': row[0].format(*row[2:])} + for j, k in enumerate(keys): + this_obj[k.strip()] = row[j + 2] + results.append(this_obj) + except csv.Error as e: + return ["Error: CSV parsing error: " + str(e)] + return results + + +def _run_query(query_name: str, database_path: str, language: str, template_values: dict): + """Run a CodeQL query and return the results""" + + try: + database_path = _resolve_db_path(database_path) + except RuntimeError: + return f"The database path for {database_path} could not be resolved" + try: + query_path = _resolve_query_path(language, query_name) + except RuntimeError: + return f"The query {query_name} is not supported for language: {language}" + try: + csv = run_query(Path(__file__).parent.resolve() / + query_path, + database_path, + fmt='csv', + template_values=template_values, + log_stderr=True) + return _csv_parse(csv) + except Exception as e: + return f"The query {query_name} encountered an error: {e}" + +def _get_file_contents(db: str | Path, uri: str): + """Retrieve file contents from a CodeQL database""" + db = Path(db) + return file_from_uri(uri, db) + +backend = CodeqlSqliteBackend(MEMORY) + +@mcp.tool() +def remote_sources(owner: str, repo: str, + database_path: str = Field(description="The CodeQL database path."), + language: str = Field(description="The language used for the CodeQL database.")): + """List all remote sources and their locations in a CodeQL database, then store the results in a database.""" + + repo = f"{owner}/{repo}" + results = _run_query('remote_sources', database_path, language, {}) + + # Check if results is an error (list of strings) or valid data (list of dicts) + if results and isinstance(results[0], str): + return f"Error: {results[0]}" + + # Store each result as a source + stored_count = 0 + for result in results: + backend.store_new_source( + repo=repo, + source_location=result.get('location', ''), + type=result.get('source', ''), + notes='', #result.get('description', ''), + update=False + ) + stored_count += 1 + + return f"Stored {stored_count} remote sources in {repo}." + +@mcp.tool() +def fetch_sources(owner: str, repo: str): + """ + Fetch all sources from the repo + """ + repo = f"{owner}/{repo}" + return json.dumps(backend.get_sources(repo)) + +@mcp.tool() +def add_source_notes(owner: str, repo: str, + database_path: str = Field(description="The CodeQL database path."), + source_location: str = Field(description="The path to the file and column info that contains the source"), + notes: str = Field(description="The notes to append to this source", default="")): + """ + Add new notes to an existing source. The notes will be appended to any existing notes. + """ + repo = f"{owner}/{repo}" + try: + database_path = _resolve_db_path(database_path) + except RuntimeError: + return f"The database path for {database_path} could not be resolved" + return backend.store_new_source(repo, source_location, "", notes, update=True) + +@mcp.tool() +def clear_codeql_repo(owner: str, repo: str): + """ + Clear all data for a given repo from the database + """ + repo = f"{owner}/{repo}" + with Session(backend.engine) as session: + deleted_sources = session.query(Source).filter_by(repo=repo).delete() + # deleted_apps = session.query(Application).filter_by(repo=repo).delete() + session.commit() + return f"Cleared {deleted_sources} sources from repo {repo}." + +@mcp.tool() +def get_file_contents( + file_uri: str = Field(description="The file URI to get contents for. The URI scheme is defined as `file://path` and `file://path:region`. Examples of file URI: `file:///path/to/file:1:2:3:4`, `file:///path/to/file`. File URIs optionally contain a region definition that looks like `start_line:start_column:end_line:end_column` which will limit the contents returned to the specified region, for example `file:///path/to/file:1:2:3:4` indicates a file region of `1:2:3:4` which would return the content of the file starting at line 1, column 1 and ending at line 3 column 4. Line and column indices are 1-based, meaning line and column values start at 1. If the region is ommitted the full contents of the file will be returned, for example `file:///path/to/file` returns the full contents of `/path/to/file`."), + database_path: str = Field(description="The path to the CodeQL database.")): + """Get the contents of a file URI from a CodeQL database path.""" + + database_path = _resolve_db_path(database_path) + try: + # fix up any incorrectly formatted relative path uri + if not file_uri.startswith('file:///'): + if file_uri.startswith('file://'): + file_uri = file_uri[len('file://'):] + file_uri = 'file:///' + file_uri.lstrip('/') + results = _get_file_contents(database_path, file_uri) + except Exception as e: + results = f"Error: could not retrieve {file_uri}: {e}" + return results + +@mcp.tool() +def list_source_files(database_path: str = Field(description="The path to the CodeQL database."), + regex_filter: str = Field(description="Optional Regex filter.", default = r'[\s\S]+')): + """List the available source files in a CodeQL database using their file:// URI""" + database_path = _resolve_db_path(database_path) + results = list_src_files(database_path, as_uri=True) + return json.dumps([{'uri': item} for item in results if re.search(regex_filter, item)], indent=2) + +@mcp.tool() +def search_in_source_code(database_path: str = Field(description="The path to the CodeQL database."), + search_term: str = Field(description="The term to search in the source code")): + """ + Search for a string in the source code. Returns the line number and file. + """ + resolved_database_path = _resolve_db_path(database_path) + results = search_in_src_archive(resolved_database_path, search_term) + out = [] + if isinstance(results, dict): + for k,v in results.items(): + out.append({"database" : database_path, "path" : k, "lines" : v}) + return json.dumps(out, indent = 2) + +if __name__ == "__main__": + mcp.run(show_banner=False, transport="http", host="127.0.0.1", port=9998) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.lock.yml b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.lock.yml new file mode 100644 index 0000000..a1dd003 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.lock.yml @@ -0,0 +1,30 @@ +--- +lockVersion: 1.0.0 +dependencies: + codeql/concepts: + version: 0.0.8 + codeql/controlflow: + version: 2.0.18 + codeql/dataflow: + version: 2.0.18 + codeql/mad: + version: 1.0.34 + codeql/python-all: + version: 4.1.0 + codeql/regex: + version: 1.0.34 + codeql/ssa: + version: 2.0.10 + codeql/threat-models: + version: 1.0.34 + codeql/tutorial: + version: 1.0.34 + codeql/typetracking: + version: 2.0.18 + codeql/util: + version: 2.0.21 + codeql/xml: + version: 1.0.34 + codeql/yaml: + version: 1.0.34 +compiled: false diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.yml b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.yml new file mode 100644 index 0000000..b0a2e23 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/codeql-pack.yml @@ -0,0 +1,7 @@ +--- +library: false +warnOnImplicitThis: false +name: sylwia-budzynska/mcp-python +version: 0.0.1 +dependencies: + codeql/python-all: ^4.1.0 \ No newline at end of file diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/example.ql b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/example.ql new file mode 100644 index 0000000..e45af98 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/example.ql @@ -0,0 +1,12 @@ +/** + * This is an automatically generated file + * @name Hello world + * @kind problem + * @problem.severity warning + * @id python/example/hello-world + */ + +import python + +from File f +select f, "Hello, world!" \ No newline at end of file diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql new file mode 100644 index 0000000..3f03461 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql @@ -0,0 +1,17 @@ +/** + * @id mcp-python/remote-sources + * @name Python Remote Sources + * @description Identifies nodes that act as remote sources in Python code, along with their locations. + * @tags source, location + */ +import python +import semmle.python.dataflow.new.RemoteFlowSources + +string normalizeLocation(Location l) { + result = "file://" + "/" + l.getFile().getRelativePath() + ":" + l.getStartLine().toString() + ":" + l.getStartColumn().toString() + + ":" + l.getEndLine().toString() + ":" + l.getEndColumn().toString() +} + +from RemoteFlowSource source +select + "Remote source {0} is defined at {1}", "source,location", source.getSourceType(), normalizeLocation(source.getLocation()) diff --git a/src/seclab_taskflows/toolboxes/codeql_python.yaml b/src/seclab_taskflows/toolboxes/codeql_python.yaml new file mode 100644 index 0000000..a901e08 --- /dev/null +++ b/src/seclab_taskflows/toolboxes/codeql_python.yaml @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + +seclab-taskflow-agent: + version: 1 + filetype: toolbox + +server_params: + kind: streamable + url: 'http://localhost:9998/mcp' + # if you set a command/args/env we will also start the server on demand + command: python + args: ["-m", "seclab_taskflows.mcp_servers.codeql_python.mcp_server"] + env: + CODEQL_DBS_BASE_PATH: "{{ env CODEQL_DBS_BASE_PATH }}" + # prevent git repo operations on gh codeql executions + GH_NO_UPDATE_NOTIFIER: "Disable" + GH_NO_EXTENSION_UPDATE_NOTIFIER: "Disable" + CODEQL_CLI: "{{ env CODEQL_CLI }}" + CODEQL_SQLITE_DIR: "{{ env DATA_DIR }}" +server_prompt: | + ## CodeQL Supported Programming Languages + + CodeQL supports the following languages, which you'll refer to by their + CodeQL acronyms and which are detailed below: + + - actions: GitHub Actions workflows + - cpp: The C and C++ programming language + - csharp: The C# programming language + - go: The Go programming language + - java: The Java programming language (including Kotlin) + - javascript: The JavaScript programming language (including TypeScript) + - python: The Python programming language + - ruby: The Ruby programming language + - rust: The Rust programming language + - swift: The Swift programming language + + When interacting with CodeQL databases, you will need to provide the + appropriate language acronym for the type of project contained within the + CodeQL database. + + For example, when interacting with a CodeQL database for a C based project + you would reference its language as `cpp` for any CodeQL database + interactions. + + If you are unable to determine the appropriate programming language acronym, + halt your task and ask the user to clarify which programming language the + CodeQL database in question was created for. + + + ### CodeQL Database File URI + + The CodeQL database file URI scheme is defined as `file://path` and + `file://path:region`. + + Examples of CodeQL database file URIs: + + - `file:///path/to/file:1:2:3:4` + - `file:///path/to/file` + + File URIs optionally contain a region definition that looks like + `start_line:start_column:end_line:end_column` which will limit the contents + returned to the specified region. For example `file:///path/to/file:1:2:3:4` + indicates a file region of `1:2:3:4` which would return the content of the + file starting at line 1, column 1 and ending at line 3 column 4. These line + and column indices are 1-based, meaning line and column values start at 1. + + If the region is ommitted the full contents of the file will be returned, + for example `file:///path/to/file` returns the full contents of + `/path/to/file`. + + If you want to fetch a specific region by their line numbers only, you can set + the `start_column` and `end_column` values of a region to `0`. For example to + retrieve lines 1-4 from a file at `/path/to/file` you can use a file URI + with a region definition such as: `file:///path/to/file:1:0:4:0`. + + When unsure how to fetch a specific region, fall back to fetching the full file + contents for a file by ommitting the region definition, for example + `file:///path/to/file` From fe60ea11a46d7763a9add018c9ab503d1d08e990 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:47:49 +0000 Subject: [PATCH 03/26] Add auditer --- .../personalities/auditer.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 src/seclab_taskflows/personalities/auditer.yaml diff --git a/src/seclab_taskflows/personalities/auditer.yaml b/src/seclab_taskflows/personalities/auditer.yaml new file mode 100644 index 0000000..33b6fd5 --- /dev/null +++ b/src/seclab_taskflows/personalities/auditer.yaml @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + +seclab-taskflow-agent: + version: 1 + filetype: personality + +personality: | + You are a code security expert. + You have the ability to call tools to aid you in your security reviews. + +task: | + Find and call out security vulnerabilities in any provided code. + Any user prompt that follows this one with more specific instructions + should be followed instead. Try to include line number information for + your findings where possible. + +toolboxes: + - seclab_taskflow_agent.toolboxes.memcache + - seclab_taskflows.toolboxes.codeql_python From 72a154c461729efa7ddd1b2e5f5cf7d25c8a4f3d Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Thu, 27 Nov 2025 12:48:42 +0000 Subject: [PATCH 04/26] Add CodeQL-Python taskflow --- .../taskflows/audit/remote_sources_local.yaml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml new file mode 100644 index 0000000..6cbfba0 --- /dev/null +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + +seclab-taskflow-agent: + filetype: taskflow + version: 1 + +model_config: seclab_taskflows.configs.model_config + +globals: + repo: + apache/allura +# Taskflow to analyze the existing information +taskflow: + - task: + must_complete: true + headless: true + model: general_tasks + agents: + - seclab_taskflow_agent.personalities.assistant + user_prompt: | + Clear the memory cache and clear the codeql_sqlite database for repo {{ GLOBALS_repo }}. + toolboxes: + - seclab_taskflow_agent.toolboxes.memcache + - seclab_taskflows.toolboxes.codeql_python + - task: + model: general_tasks + must_complete: true + headless: true + agents: + - seclab_taskflow_agent.personalities.assistant + user_prompt: | + For the repo {{ GLOBALS_repo }} fetch the Python CodeQL database and find all remote flow sources using CodeQL. + Store the value for CodeQL's 'relative_database_path' in 'codeql_relative_database_path' memory key. + toolboxes: + - seclab_taskflows.toolboxes.gh_code_scanning + - seclab_taskflows.toolboxes.codeql_python + - seclab_taskflow_agent.toolboxes.memcache + - task: + must_complete: true + exclude_from_context: true + model: general_tasks + agents: + - seclab_taskflow_agent.personalities.assistant + user_prompt: | + Fetch the sources from the repo {{ GLOBALS_repo }}. + toolboxes: + - seclab_taskflows.toolboxes.codeql_python + - task: + model: code_analysis + must_complete: false + repeat_prompt: true + async: true + async_limit: 5 + max_steps: 100 + name: source analysis + description: Identify actions that untrusted users are allowed performed the source. + agents: + - seclab_taskflows.personalities.auditer + user_prompt: | + Retrieve the contents of the `codeql_relative_database_path` memory key, which represents the relative path to the CodeQL database for the repository {{ GLOBALS_repo }}. + Using the CodeQL database located at that path, analyze the following source: + The source is a {{ RESULT_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }}. + Analyze what the source endpoint is for and how it is used. + Search for relevant code associated with each source. + If it is a web endpoint, identify the routing path that reaches this source, HTTP method, + any middlewares used, which roles are allowed to call it. + Note which kind of authentication is required for that endpoint. + It is possible that the source does not have require any authentication. + If authorization is required, note the details. + Analyze the code and identify if this source could lead to a security vulnerability. + + Update the source entry in the codeql_sqlite database with your findings. + ## IMPORTANT: General Guidance that ALWAYS applies + + 1. Do NOT ask the user for permission to perform next steps, continue your + analysis autonomously until it is complete. + + 2. Do NOT use 'fetch_sources' or 'remote_sources' tools. + + 3. Do NOT speculate. If you do not have access to the information you need, respond with + the error you encountered. + toolboxes: + - seclab_taskflows.toolboxes.codeql_python + - seclab_taskflow_agent.toolboxes.memcache + - task: + must_complete: true + agents: + - seclab_taskflows.personalities.web_application_security_expert + model: code_analysis + user_prompt: | + Fetch the sources of the repo {{ GLOBALS_repo }} and give a summary of the notes. + toolboxes: + - seclab_taskflows.toolboxes.codeql_python + - seclab_taskflow_agent.toolboxes.memcache From 4af4a6517eed9b46cad3900dc0e0e8499990b6a8 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Thu, 27 Nov 2025 16:20:12 +0100 Subject: [PATCH 05/26] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/codeql_python/README.md | 2 +- .../mcp_servers/codeql_python/codeql_sqlite_models.py | 4 ++-- .../mcp_servers/codeql_python/mcp_server.py | 8 ++++---- .../taskflows/audit/remote_sources_local.yaml | 2 +- src/seclab_taskflows/toolboxes/codeql_python.yaml | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/README.md b/src/seclab_taskflows/mcp_servers/codeql_python/README.md index 2184ce0..7f5a19d 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/README.md +++ b/src/seclab_taskflows/mcp_servers/codeql_python/README.md @@ -17,7 +17,7 @@ It will most likely look similar to this: - create a folder named 'data' - create or update your `.env` file in the root of this project with values for: ``` -COPILOT_TOKEN= # a fine-grained GitHub personal access token with permssion for "copilot chat" +COPILOT_TOKEN= # a fine-grained GitHub personal access token with permission for "copilot chat" CODEQL_DBS_BASE_PATH="/workspaces/seclab-taskflows/data/codeql_databases" #path to folder with your CodeQL databases # Example values for a local setup, run with `python -m seclab_taskflow_agent -t seclab_taskflows.taskflows.audit.remote_sources_local` diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py index 5d8b0e7..a348460 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py @@ -1,8 +1,8 @@ # SPDX-FileCopyrightText: 2025 GitHub # SPDX-License-Identifier: MIT -from sqlalchemy import String, Text, Integer, ForeignKey, Column -from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped, relationship +from sqlalchemy import Text +from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped from typing import Optional class Base(DeclarativeBase): diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 3fd1b95..4a7bb24 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -20,7 +20,7 @@ import json from sqlalchemy import create_engine from sqlalchemy.orm import Session -from pathlib import Path + import zipfile import httpx import aiofiles @@ -88,7 +88,7 @@ def store_new_source(self, repo, source_location, type, notes, update = False): with Session(self.engine) as session: existing = session.query(Source).filter_by(repo = repo, source_location = source_location).first() if existing: - existing.notes += notes + existing.notes = (existing.notes or "") + notes session.commit() return f"Updated notes for source at {source_location} in {repo}." else: @@ -120,7 +120,7 @@ def _csv_parse(raw): for j, k in enumerate(keys): this_obj[k.strip()] = row[j + 2] results.append(this_obj) - except csv.Error as e: + except (csv.Error, IndexError, ValueError) as e: return ["Error: CSV parsing error: " + str(e)] return results @@ -218,7 +218,7 @@ def clear_codeql_repo(owner: str, repo: str): @mcp.tool() def get_file_contents( - file_uri: str = Field(description="The file URI to get contents for. The URI scheme is defined as `file://path` and `file://path:region`. Examples of file URI: `file:///path/to/file:1:2:3:4`, `file:///path/to/file`. File URIs optionally contain a region definition that looks like `start_line:start_column:end_line:end_column` which will limit the contents returned to the specified region, for example `file:///path/to/file:1:2:3:4` indicates a file region of `1:2:3:4` which would return the content of the file starting at line 1, column 1 and ending at line 3 column 4. Line and column indices are 1-based, meaning line and column values start at 1. If the region is ommitted the full contents of the file will be returned, for example `file:///path/to/file` returns the full contents of `/path/to/file`."), + file_uri: str = Field(description="The file URI to get contents for. The URI scheme is defined as `file://path` and `file://path:region`. Examples of file URI: `file:///path/to/file:1:2:3:4`, `file:///path/to/file`. File URIs optionally contain a region definition that looks like `start_line:start_column:end_line:end_column` which will limit the contents returned to the specified region, for example `file:///path/to/file:1:2:3:4` indicates a file region of `1:2:3:4` which would return the content of the file starting at line 1, column 1 and ending at line 3 column 4. Line and column indices are 1-based, meaning line and column values start at 1. If the region is omitted the full contents of the file will be returned, for example `file:///path/to/file` returns the full contents of `/path/to/file`."), database_path: str = Field(description="The path to the CodeQL database.")): """Get the contents of a file URI from a CodeQL database path.""" diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index 6cbfba0..e240e05 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -54,7 +54,7 @@ taskflow: async_limit: 5 max_steps: 100 name: source analysis - description: Identify actions that untrusted users are allowed performed the source. + description: Identify actions that untrusted users are allowed perform the source. agents: - seclab_taskflows.personalities.auditer user_prompt: | diff --git a/src/seclab_taskflows/toolboxes/codeql_python.yaml b/src/seclab_taskflows/toolboxes/codeql_python.yaml index a901e08..2b17f6b 100644 --- a/src/seclab_taskflows/toolboxes/codeql_python.yaml +++ b/src/seclab_taskflows/toolboxes/codeql_python.yaml @@ -62,7 +62,7 @@ server_prompt: | `start_line:start_column:end_line:end_column` which will limit the contents returned to the specified region. For example `file:///path/to/file:1:2:3:4` indicates a file region of `1:2:3:4` which would return the content of the - file starting at line 1, column 1 and ending at line 3 column 4. These line + file starting at line 1, column 2 and ending at line 3, column 4. These line and column indices are 1-based, meaning line and column values start at 1. If the region is ommitted the full contents of the file will be returned, @@ -75,5 +75,5 @@ server_prompt: | with a region definition such as: `file:///path/to/file:1:0:4:0`. When unsure how to fetch a specific region, fall back to fetching the full file - contents for a file by ommitting the region definition, for example + contents for a file by omitting the region definition, for example `file:///path/to/file` From ab3ec175fb2855e51fe3c294e3efc83b805d8a5e Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Fri, 28 Nov 2025 13:04:02 +0100 Subject: [PATCH 06/26] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../mcp_servers/codeql_python/mcp_server.py | 10 ++++++---- .../taskflows/audit/remote_sources_local.yaml | 2 +- src/seclab_taskflows/toolboxes/codeql_python.yaml | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 4a7bb24..f653661 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -13,7 +13,7 @@ from pydantic import Field #from mcp.server.fastmcp import FastMCP, Context -from fastmcp import FastMCP, Context # use FastMCP 2.0 +from fastmcp import FastMCP # use FastMCP 2.0 from pathlib import Path import os import csv @@ -21,9 +21,9 @@ from sqlalchemy import create_engine from sqlalchemy.orm import Session -import zipfile -import httpx -import aiofiles + + + from .codeql_sqlite_models import Base, Source MEMORY = Path(os.getenv('CODEQL_SQLITE_DIR', default='/app/my_data')) @@ -164,6 +164,8 @@ def remote_sources(owner: str, repo: str, results = _run_query('remote_sources', database_path, language, {}) # Check if results is an error (list of strings) or valid data (list of dicts) + if isinstance(results, str): + return f"Error: {results}" if results and isinstance(results[0], str): return f"Error: {results[0]}" diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index e240e05..bb0153a 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -66,7 +66,7 @@ taskflow: If it is a web endpoint, identify the routing path that reaches this source, HTTP method, any middlewares used, which roles are allowed to call it. Note which kind of authentication is required for that endpoint. - It is possible that the source does not have require any authentication. + It is possible that the source does not require any authentication. If authorization is required, note the details. Analyze the code and identify if this source could lead to a security vulnerability. diff --git a/src/seclab_taskflows/toolboxes/codeql_python.yaml b/src/seclab_taskflows/toolboxes/codeql_python.yaml index 2b17f6b..7ffe674 100644 --- a/src/seclab_taskflows/toolboxes/codeql_python.yaml +++ b/src/seclab_taskflows/toolboxes/codeql_python.yaml @@ -65,7 +65,7 @@ server_prompt: | file starting at line 1, column 2 and ending at line 3, column 4. These line and column indices are 1-based, meaning line and column values start at 1. - If the region is ommitted the full contents of the file will be returned, + If the region is omitted the full contents of the file will be returned, for example `file:///path/to/file` returns the full contents of `/path/to/file`. From a864ffa59f77c56732760b6bd93f89b4d02133d4 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 15:12:47 +0000 Subject: [PATCH 07/26] Use local resources in CodeQL for Python MCP --- .../codeql_python/codeql_sqlite_models.py | 1 + .../mcp_servers/codeql_python/mcp_server.py | 65 ++++--------------- .../queries/mcp-python/remote_sources.ql | 14 ++-- .../{auditer.yaml => auditor.yaml} | 0 .../taskflows/audit/remote_sources_local.yaml | 20 ++---- .../toolboxes/codeql_python.yaml | 31 --------- 6 files changed, 29 insertions(+), 102 deletions(-) rename src/seclab_taskflows/personalities/{auditer.yaml => auditor.yaml} (100%) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py index a348460..b2c5273 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py @@ -15,6 +15,7 @@ class Source(Base): id: Mapped[int] = mapped_column(primary_key=True) repo: Mapped[str] source_location: Mapped[str] + line: Mapped[int] type: Mapped[str] notes: Mapped[Optional[str]] = mapped_column(Text, nullable=True) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 4a7bb24..cd6dc92 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -45,6 +45,7 @@ def source_to_dict(result): "source_id": result.id, "repo": result.repo, "source_location": result.source_location, + "line": result.line, "type": result.type, "notes": result.notes } @@ -84,17 +85,17 @@ def __init__(self, memcache_state_dir: str): Base.metadata.create_all(self.engine, tables=[Source.__table__]) - def store_new_source(self, repo, source_location, type, notes, update = False): + def store_new_source(self, repo, source_location, line, type, notes, update = False): with Session(self.engine) as session: - existing = session.query(Source).filter_by(repo = repo, source_location = source_location).first() + existing = session.query(Source).filter_by(repo = repo, source_location = source_location, line = line).first() if existing: existing.notes = (existing.notes or "") + notes session.commit() - return f"Updated notes for source at {source_location} in {repo}." + return f"Updated notes for source at {source_location}, line {line} in {repo}." else: if update: - return f"No source exists at repo {repo}, location {source_location}" - new_source = Source(repo = repo, source_location = source_location, type = type, notes = notes) + return f"No source exists at repo {repo}, location {source_location}, line {line} to update." + new_source = Source(repo = repo, source_location = source_location, line = line, type = type, notes = notes) session.add(new_source) session.commit() return f"Added new source for {source_location} in {repo}." @@ -174,7 +175,8 @@ def remote_sources(owner: str, repo: str, repo=repo, source_location=result.get('location', ''), type=result.get('source', ''), - notes='', #result.get('description', ''), + line=int(result.get('line', '0')), + notes=None, #result.get('description', ''), update=False ) stored_count += 1 @@ -191,18 +193,15 @@ def fetch_sources(owner: str, repo: str): @mcp.tool() def add_source_notes(owner: str, repo: str, - database_path: str = Field(description="The CodeQL database path."), - source_location: str = Field(description="The path to the file and column info that contains the source"), + # database_path: str = Field(description="The CodeQL database path."), + source_location: str = Field(description="The path to the file"), + line: int = Field(description="The line number of the source"), notes: str = Field(description="The notes to append to this source", default="")): """ Add new notes to an existing source. The notes will be appended to any existing notes. """ repo = f"{owner}/{repo}" - try: - database_path = _resolve_db_path(database_path) - except RuntimeError: - return f"The database path for {database_path} could not be resolved" - return backend.store_new_source(repo, source_location, "", notes, update=True) + return backend.store_new_source(repo = repo, source_location = source_location, line = line, type = "", notes = notes, update=True) @mcp.tool() def clear_codeql_repo(owner: str, repo: str): @@ -216,45 +215,5 @@ def clear_codeql_repo(owner: str, repo: str): session.commit() return f"Cleared {deleted_sources} sources from repo {repo}." -@mcp.tool() -def get_file_contents( - file_uri: str = Field(description="The file URI to get contents for. The URI scheme is defined as `file://path` and `file://path:region`. Examples of file URI: `file:///path/to/file:1:2:3:4`, `file:///path/to/file`. File URIs optionally contain a region definition that looks like `start_line:start_column:end_line:end_column` which will limit the contents returned to the specified region, for example `file:///path/to/file:1:2:3:4` indicates a file region of `1:2:3:4` which would return the content of the file starting at line 1, column 1 and ending at line 3 column 4. Line and column indices are 1-based, meaning line and column values start at 1. If the region is omitted the full contents of the file will be returned, for example `file:///path/to/file` returns the full contents of `/path/to/file`."), - database_path: str = Field(description="The path to the CodeQL database.")): - """Get the contents of a file URI from a CodeQL database path.""" - - database_path = _resolve_db_path(database_path) - try: - # fix up any incorrectly formatted relative path uri - if not file_uri.startswith('file:///'): - if file_uri.startswith('file://'): - file_uri = file_uri[len('file://'):] - file_uri = 'file:///' + file_uri.lstrip('/') - results = _get_file_contents(database_path, file_uri) - except Exception as e: - results = f"Error: could not retrieve {file_uri}: {e}" - return results - -@mcp.tool() -def list_source_files(database_path: str = Field(description="The path to the CodeQL database."), - regex_filter: str = Field(description="Optional Regex filter.", default = r'[\s\S]+')): - """List the available source files in a CodeQL database using their file:// URI""" - database_path = _resolve_db_path(database_path) - results = list_src_files(database_path, as_uri=True) - return json.dumps([{'uri': item} for item in results if re.search(regex_filter, item)], indent=2) - -@mcp.tool() -def search_in_source_code(database_path: str = Field(description="The path to the CodeQL database."), - search_term: str = Field(description="The term to search in the source code")): - """ - Search for a string in the source code. Returns the line number and file. - """ - resolved_database_path = _resolve_db_path(database_path) - results = search_in_src_archive(resolved_database_path, search_term) - out = [] - if isinstance(results, dict): - for k,v in results.items(): - out.append({"database" : database_path, "path" : k, "lines" : v}) - return json.dumps(out, indent = 2) - if __name__ == "__main__": mcp.run(show_banner=False, transport="http", host="127.0.0.1", port=9998) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql index 3f03461..32ea16c 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql +++ b/src/seclab_taskflows/mcp_servers/codeql_python/queries/mcp-python/remote_sources.ql @@ -7,11 +7,15 @@ import python import semmle.python.dataflow.new.RemoteFlowSources -string normalizeLocation(Location l) { - result = "file://" + "/" + l.getFile().getRelativePath() + ":" + l.getStartLine().toString() + ":" + l.getStartColumn().toString() - + ":" + l.getEndLine().toString() + ":" + l.getEndColumn().toString() -} +// string normalizeLocation(Location l) { +// result = l.getFile().getRelativePath() + ":" + l.getStartLine().toString() + ":" + l.getStartColumn().toString() +// + ":" + l.getEndLine().toString() + ":" + l.getEndColumn().toString() +// } from RemoteFlowSource source select - "Remote source {0} is defined at {1}", "source,location", source.getSourceType(), normalizeLocation(source.getLocation()) + "Remote source {0} is defined at {1} line {2}", + "source,location,line", + source.getSourceType(), + source.getLocation().getFile().getRelativePath(), + source.getLocation().getStartLine().toString() diff --git a/src/seclab_taskflows/personalities/auditer.yaml b/src/seclab_taskflows/personalities/auditor.yaml similarity index 100% rename from src/seclab_taskflows/personalities/auditer.yaml rename to src/seclab_taskflows/personalities/auditor.yaml diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index e240e05..5d8820e 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -9,7 +9,6 @@ model_config: seclab_taskflows.configs.model_config globals: repo: - apache/allura # Taskflow to analyze the existing information taskflow: - task: @@ -31,11 +30,9 @@ taskflow: - seclab_taskflow_agent.personalities.assistant user_prompt: | For the repo {{ GLOBALS_repo }} fetch the Python CodeQL database and find all remote flow sources using CodeQL. - Store the value for CodeQL's 'relative_database_path' in 'codeql_relative_database_path' memory key. toolboxes: - seclab_taskflows.toolboxes.gh_code_scanning - seclab_taskflows.toolboxes.codeql_python - - seclab_taskflow_agent.toolboxes.memcache - task: must_complete: true exclude_from_context: true @@ -50,25 +47,21 @@ taskflow: model: code_analysis must_complete: false repeat_prompt: true - async: true - async_limit: 5 max_steps: 100 name: source analysis description: Identify actions that untrusted users are allowed perform the source. agents: - - seclab_taskflows.personalities.auditer + - seclab_taskflows.personalities.auditor user_prompt: | - Retrieve the contents of the `codeql_relative_database_path` memory key, which represents the relative path to the CodeQL database for the repository {{ GLOBALS_repo }}. - Using the CodeQL database located at that path, analyze the following source: - The source is a {{ RESULT_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }}. - Analyze what the source endpoint is for and how it is used. - Search for relevant code associated with each source. + Fetch the zipball of the repository {{ GLOBALS_repo }} and use it to analyze the source. + The source is a {{ RESULT_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }} on line {{ RESULT_line }}. + If the source is in a folder relating to tests or demo code, skip the analysis and update the source entry in the codeql_sqlite database indicating it is not relevant. + Analyze what the source endpoint is used for. If it is a web endpoint, identify the routing path that reaches this source, HTTP method, any middlewares used, which roles are allowed to call it. Note which kind of authentication is required for that endpoint. It is possible that the source does not have require any authentication. If authorization is required, note the details. - Analyze the code and identify if this source could lead to a security vulnerability. Update the source entry in the codeql_sqlite database with your findings. ## IMPORTANT: General Guidance that ALWAYS applies @@ -82,7 +75,8 @@ taskflow: the error you encountered. toolboxes: - seclab_taskflows.toolboxes.codeql_python - - seclab_taskflow_agent.toolboxes.memcache + - seclab_taskflows.toolboxes.local_gh_resources + - seclab_taskflows.toolboxes.local_file_viewer - task: must_complete: true agents: diff --git a/src/seclab_taskflows/toolboxes/codeql_python.yaml b/src/seclab_taskflows/toolboxes/codeql_python.yaml index 2b17f6b..a838763 100644 --- a/src/seclab_taskflows/toolboxes/codeql_python.yaml +++ b/src/seclab_taskflows/toolboxes/codeql_python.yaml @@ -46,34 +46,3 @@ server_prompt: | If you are unable to determine the appropriate programming language acronym, halt your task and ask the user to clarify which programming language the CodeQL database in question was created for. - - - ### CodeQL Database File URI - - The CodeQL database file URI scheme is defined as `file://path` and - `file://path:region`. - - Examples of CodeQL database file URIs: - - - `file:///path/to/file:1:2:3:4` - - `file:///path/to/file` - - File URIs optionally contain a region definition that looks like - `start_line:start_column:end_line:end_column` which will limit the contents - returned to the specified region. For example `file:///path/to/file:1:2:3:4` - indicates a file region of `1:2:3:4` which would return the content of the - file starting at line 1, column 2 and ending at line 3, column 4. These line - and column indices are 1-based, meaning line and column values start at 1. - - If the region is ommitted the full contents of the file will be returned, - for example `file:///path/to/file` returns the full contents of - `/path/to/file`. - - If you want to fetch a specific region by their line numbers only, you can set - the `start_column` and `end_column` values of a region to `0`. For example to - retrieve lines 1-4 from a file at `/path/to/file` you can use a file URI - with a region definition such as: `file:///path/to/file:1:0:4:0`. - - When unsure how to fetch a specific region, fall back to fetching the full file - contents for a file by omitting the region definition, for example - `file:///path/to/file` From 8c2f42f583b2459eeb6f5270042c7598b196c7ae Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 19:25:20 +0000 Subject: [PATCH 08/26] Refactor process_repo into utils --- .../mcp_servers/codeql_python/mcp_server.py | 21 ++++------ .../mcp_servers/repo_context.py | 42 +++++++++---------- src/seclab_taskflows/mcp_servers/utils.py | 2 + 3 files changed, 30 insertions(+), 35 deletions(-) create mode 100644 src/seclab_taskflows/mcp_servers/utils.py diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index e389f3b..aab45e4 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -21,10 +21,8 @@ from sqlalchemy import create_engine from sqlalchemy.orm import Session - - - from .codeql_sqlite_models import Base, Source +from .utils import process_repo MEMORY = Path(os.getenv('CODEQL_SQLITE_DIR', default='/app/my_data')) mcp = FastMCP("CodeQL-Python") @@ -66,7 +64,9 @@ def _resolve_db_path(relative_db_path: str | Path): # not windows compatible and probably needs additional hardening relative_db_path = str(relative_db_path).strip().lstrip('/') relative_db_path = Path(relative_db_path) - absolute_path = CODEQL_DBS_BASE_PATH / relative_db_path + absolute_path = (CODEQL_DBS_BASE_PATH / relative_db_path).resolve() + if not str(absolute_path).startswith(str(CODEQL_DBS_BASE_PATH.resolve())): + raise RuntimeError(f"Error: Database path {absolute_path} is outside the base path {CODEQL_DBS_BASE_PATH}") if not absolute_path.is_dir(): _debug_log(f"Database path not found: {absolute_path}") raise RuntimeError(f"Error: Database not found at {absolute_path}!") @@ -148,11 +148,6 @@ def _run_query(query_name: str, database_path: str, language: str, template_valu except Exception as e: return f"The query {query_name} encountered an error: {e}" -def _get_file_contents(db: str | Path, uri: str): - """Retrieve file contents from a CodeQL database""" - db = Path(db) - return file_from_uri(uri, db) - backend = CodeqlSqliteBackend(MEMORY) @mcp.tool() @@ -161,7 +156,7 @@ def remote_sources(owner: str, repo: str, language: str = Field(description="The language used for the CodeQL database.")): """List all remote sources and their locations in a CodeQL database, then store the results in a database.""" - repo = f"{owner}/{repo}" + repo = process_repo(owner, repo) results = _run_query('remote_sources', database_path, language, {}) # Check if results is an error (list of strings) or valid data (list of dicts) @@ -190,7 +185,7 @@ def fetch_sources(owner: str, repo: str): """ Fetch all sources from the repo """ - repo = f"{owner}/{repo}" + repo = process_repo(owner, repo) return json.dumps(backend.get_sources(repo)) @mcp.tool() @@ -202,7 +197,7 @@ def add_source_notes(owner: str, repo: str, """ Add new notes to an existing source. The notes will be appended to any existing notes. """ - repo = f"{owner}/{repo}" + repo = process_repo(owner, repo) return backend.store_new_source(repo = repo, source_location = source_location, line = line, type = "", notes = notes, update=True) @mcp.tool() @@ -210,7 +205,7 @@ def clear_codeql_repo(owner: str, repo: str): """ Clear all data for a given repo from the database """ - repo = f"{owner}/{repo}" + repo = process_repo(owner, repo) with Session(backend.engine) as session: deleted_sources = session.query(Source).filter_by(repo=repo).delete() # deleted_apps = session.query(Application).filter_by(repo=repo).delete() diff --git a/src/seclab_taskflows/mcp_servers/repo_context.py b/src/seclab_taskflows/mcp_servers/repo_context.py index d4c5917..5bf20dc 100644 --- a/src/seclab_taskflows/mcp_servers/repo_context.py +++ b/src/seclab_taskflows/mcp_servers/repo_context.py @@ -21,6 +21,7 @@ from pathlib import Path from .repo_context_models import Application, EntryPoint, UserAction, WebEntryPoint, ApplicationIssue, AuditResult, Base +from .utils import process_repo MEMORY = Path(os.getenv('REPO_CONTEXT_DIR', default='/app/my_data')) @@ -90,9 +91,9 @@ def __init__(self, memcache_state_dir: str): else: db_dir = f'sqlite:///{self.memcache_state_dir}/repo_context.db' self.engine = create_engine(db_dir, echo=False) - Base.metadata.create_all(self.engine, tables=[Application.__table__, EntryPoint.__table__, UserAction.__table__, + Base.metadata.create_all(self.engine, tables=[Application.__table__, EntryPoint.__table__, UserAction.__table__, WebEntryPoint.__table__, ApplicationIssue.__table__, AuditResult.__table__]) - + def store_new_application(self, repo, location, is_app, is_library, notes): with Session(self.engine) as session: existing = session.query(Application).filter_by(repo = repo, location = location).first() @@ -107,7 +108,7 @@ def store_new_application(self, repo, location, is_app, is_library, notes): session.add(new_application) session.commit() return f"Updated or added application for {location} in {repo}." - + def store_new_component_issue(self, repo, component_id, issue_type, notes): with Session(self.engine) as session: existing = session.query(ApplicationIssue).filter_by(repo = repo, component_id = component_id, issue_type = issue_type).first() @@ -155,7 +156,7 @@ def store_new_entry_point(self, repo, app_id, file, user_input, line, notes, upd session.add(new_entry_point) session.commit() return f"Updated or added entry point for {file} and {line} in {repo}." - + def store_new_web_entry_point(self, repo, entry_point_id, method, path, component, auth, middleware, roles_scopes, notes, update = False): with Session(self.engine) as session: existing = session.query(WebEntryPoint).filter_by(repo = repo, entry_point_id = entry_point_id).first() @@ -177,7 +178,7 @@ def store_new_web_entry_point(self, repo, entry_point_id, method, path, componen if update: return f"No web entry point exists at repo {repo} with entry_point_id {entry_point_id}." new_web_entry_point = WebEntryPoint( - repo = repo, + repo = repo, entry_point_id = entry_point_id, method = method, path = path, @@ -190,7 +191,7 @@ def store_new_web_entry_point(self, repo, entry_point_id, method, path, componen session.add(new_web_entry_point) session.commit() return f"Updated or added web entry point for entry_point_id {entry_point_id} in {repo}." - + def store_new_user_action(self, repo, app_id, file, line, notes, update = False): with Session(self.engine) as session: existing = session.query(UserAction).filter_by(repo = repo, file = file, line = line).first() @@ -203,7 +204,7 @@ def store_new_user_action(self, repo, app_id, file, line, notes, update = False) session.add(new_user_action) session.commit() return f"Updated or added user action for {file} and {line} in {repo}." - + def get_app(self, repo, location): with Session(self.engine) as session: existing = session.query(Application).filter_by(repo = repo, location = location).first() @@ -271,7 +272,7 @@ def get_web_entries_for_repo(self, repo): with Session(self.engine) as session: results = session.query(WebEntryPoint).filter_by(repo = repo).all() return [{ - 'repo' : r.repo, + 'repo' : r.repo, 'entry_point_id' : r.entry_point_id, 'method' : r.method, 'path' : r.path, @@ -286,7 +287,7 @@ def get_web_entries(self, repo, component_id): with Session(self.engine) as session: results = session.query(WebEntryPoint).filter_by(repo = repo, component = component_id).all() return [{ - 'repo' : r.repo, + 'repo' : r.repo, 'entry_point_id' : r.entry_point_id, 'method' : r.method, 'path' : r.path, @@ -313,7 +314,7 @@ def get_user_actions_for_repo(self, repo): ).filter(UserAction.app_id == Application.id).all() uas = [user_action_to_dict(ua) for app, ua in results] return uas - + def clear_repo(self, repo): with Session(self.engine) as session: session.query(Application).filter_by(repo = repo).delete() @@ -324,7 +325,7 @@ def clear_repo(self, repo): session.query(AuditResult).filter_by(repo = repo).delete() session.commit() return f"Cleared results for repo {repo}" - + def clear_repo_issues(self, repo): with Session(self.engine) as session: session.query(ApplicationIssue).filter_by(repo = repo).delete() @@ -336,13 +337,10 @@ def clear_repo_issues(self, repo): backend = RepoContextBackend(MEMORY) -def process_repo(owner, repo): - return f"{owner}/{repo}".lower() - @mcp.tool() -def store_new_component(owner: str, repo: str, location: str = Field(description="The directory of the component"), - is_app: bool = Field(description="Is this an application", default=None), - is_library: bool = Field(description="Is this a library", default=None), +def store_new_component(owner: str, repo: str, location: str = Field(description="The directory of the component"), + is_app: bool = Field(description="Is this an application", default=None), + is_library: bool = Field(description="Is this a library", default=None), notes: str = Field(description="The notes taken for this component", default="")): """ Stores a new component in the database. @@ -386,9 +384,9 @@ def store_new_component_issue(owner: str, repo: str, component_id: int, return backend.store_new_component_issue(repo, component_id, issue_type, notes) @mcp.tool() -def store_new_audit_result(owner: str, repo: str, component_id: int, issue_type: str, issue_id: int, - has_non_security_error: bool = Field(description="Set to true if there are security issues or logic error but may not be exploitable"), - has_vulnerability: bool = Field(description="Set to true if a security vulnerability is identified"), +def store_new_audit_result(owner: str, repo: str, component_id: int, issue_type: str, issue_id: int, + has_non_security_error: bool = Field(description="Set to true if there are security issues or logic error but may not be exploitable"), + has_vulnerability: bool = Field(description="Set to true if a security vulnerability is identified"), notes: str = Field(description="The notes for the audit of this issue")): """ Stores the audit result for issue with issue_id. @@ -397,7 +395,7 @@ def store_new_audit_result(owner: str, repo: str, component_id: int, issue_type: return backend.store_new_audit_result(repo, component_id, issue_type, issue_id, has_non_security_error, has_vulnerability, notes) @mcp.tool() -def store_new_web_entry_point(owner: str, repo: str, +def store_new_web_entry_point(owner: str, repo: str, entry_point_id: int = Field(description="The ID of the entry point this web entry point refers to"), location: str = Field(description="The directory of the component where the web entry point belongs to"), method: str = Field(description="HTTP method (GET, POST, etc)", default=""), @@ -432,7 +430,7 @@ def add_entry_point_notes(owner: str, repo: str, @mcp.tool() def store_new_user_action(owner: str, repo: str, location: str = Field(description="The directory of the component where the user action belonged to"), file: str = Field(description="The file that contains the user action"), - line: int = Field(description="The file line that contains the user action"), + line: int = Field(description="The file line that contains the user action"), notes: str = Field(description="New notes for this user action", default = "")): """ Stores a new user action in a component to the database. diff --git a/src/seclab_taskflows/mcp_servers/utils.py b/src/seclab_taskflows/mcp_servers/utils.py new file mode 100644 index 0000000..adec991 --- /dev/null +++ b/src/seclab_taskflows/mcp_servers/utils.py @@ -0,0 +1,2 @@ +def process_repo(owner, repo): + return f"{owner}/{repo}".lower() From 341fadd12f9be17049f30f43408138a55233bd49 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 20:24:17 +0000 Subject: [PATCH 09/26] Fix import --- src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index aab45e4..f938df0 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -22,7 +22,7 @@ from sqlalchemy.orm import Session from .codeql_sqlite_models import Base, Source -from .utils import process_repo +from ..utils import process_repo MEMORY = Path(os.getenv('CODEQL_SQLITE_DIR', default='/app/my_data')) mcp = FastMCP("CodeQL-Python") From 5be484894f282d1f15aebae56fb346734758d226 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 20:30:19 +0000 Subject: [PATCH 10/26] Rename Source type field to source_type --- .../codeql_python/codeql_sqlite_models.py | 2 +- .../mcp_servers/codeql_python/mcp_server.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py index b2c5273..07bee1c 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py @@ -16,7 +16,7 @@ class Source(Base): repo: Mapped[str] source_location: Mapped[str] line: Mapped[int] - type: Mapped[str] + source_type: Mapped[str] notes: Mapped[Optional[str]] = mapped_column(Text, nullable=True) def __repr__(self): diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index f938df0..74dfb9a 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -44,7 +44,7 @@ def source_to_dict(result): "repo": result.repo, "source_location": result.source_location, "line": result.line, - "type": result.type, + "source_type": result.source_type, "notes": result.notes } @@ -76,7 +76,6 @@ def _resolve_db_path(relative_db_path: str | Path): class CodeqlSqliteBackend: def __init__(self, memcache_state_dir: str): self.memcache_state_dir = memcache_state_dir - self.location_pattern = r'^([a-zA-Z]+)(:\d+){4}$' if not Path(self.memcache_state_dir).exists(): db_dir = 'sqlite://' else: @@ -85,7 +84,7 @@ def __init__(self, memcache_state_dir: str): Base.metadata.create_all(self.engine, tables=[Source.__table__]) - def store_new_source(self, repo, source_location, line, type, notes, update = False): + def store_new_source(self, repo, source_location, line, source_type, notes, update = False): with Session(self.engine) as session: existing = session.query(Source).filter_by(repo = repo, source_location = source_location, line = line).first() if existing: @@ -95,7 +94,7 @@ def store_new_source(self, repo, source_location, line, type, notes, update = Fa else: if update: return f"No source exists at repo {repo}, location {source_location}, line {line} to update." - new_source = Source(repo = repo, source_location = source_location, line = line, type = type, notes = notes) + new_source = Source(repo = repo, source_location = source_location, line = line, source_type = source_type, notes = notes) session.add(new_source) session.commit() return f"Added new source for {source_location} in {repo}." @@ -171,7 +170,7 @@ def remote_sources(owner: str, repo: str, backend.store_new_source( repo=repo, source_location=result.get('location', ''), - type=result.get('source', ''), + source_type=result.get('source', ''), line=int(result.get('line', '0')), notes=None, #result.get('description', ''), update=False @@ -198,7 +197,7 @@ def add_source_notes(owner: str, repo: str, Add new notes to an existing source. The notes will be appended to any existing notes. """ repo = process_repo(owner, repo) - return backend.store_new_source(repo = repo, source_location = source_location, line = line, type = "", notes = notes, update=True) + return backend.store_new_source(repo = repo, source_location = source_location, line = line, source_type = "", notes = notes, update=True) @mcp.tool() def clear_codeql_repo(owner: str, repo: str): @@ -208,7 +207,6 @@ def clear_codeql_repo(owner: str, repo: str): repo = process_repo(owner, repo) with Session(backend.engine) as session: deleted_sources = session.query(Source).filter_by(repo=repo).delete() - # deleted_apps = session.query(Application).filter_by(repo=repo).delete() session.commit() return f"Cleared {deleted_sources} sources from repo {repo}." From 6705d604d4ddfe1ece15ff7e5e24d553c9ab43e6 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 21:51:14 +0100 Subject: [PATCH 11/26] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 74dfb9a..cfcdf0e 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -101,7 +101,7 @@ def store_new_source(self, repo, source_location, line, source_type, notes, upda def get_sources(self, repo): with Session(self.engine) as session: - results = session.query(Source).filter_by(repo=repo).all() + results = session.query(Source).filter_by(repo = repo).all() sources = [source_to_dict(source) for source in results] return sources @@ -206,7 +206,7 @@ def clear_codeql_repo(owner: str, repo: str): """ repo = process_repo(owner, repo) with Session(backend.engine) as session: - deleted_sources = session.query(Source).filter_by(repo=repo).delete() + deleted_sources = session.query(Source).filter_by(repo = repo).delete() session.commit() return f"Cleared {deleted_sources} sources from repo {repo}." From 51b91b92ad5f51b4bfef8c819f43b62d7eec2b00 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:04:13 +0100 Subject: [PATCH 12/26] Update src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index 3a78caa..be01b2e 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -49,7 +49,7 @@ taskflow: repeat_prompt: true max_steps: 100 name: source analysis - description: Identify actions that untrusted users are allowed perform the source. + description: Identify actions that untrusted users are allowed to perform on the source. agents: - seclab_taskflows.personalities.auditor user_prompt: | From 3d1fd19b40a740de1d1e68c28720b724e3846d67 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:04:24 +0100 Subject: [PATCH 13/26] Update src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index cfcdf0e..851544c 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -9,7 +9,7 @@ filename='logs/mcp_codeql_python.log', filemode='a' ) -from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, file_from_uri, list_src_files, _debug_log, search_in_src_archive +from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, _debug_log from pydantic import Field #from mcp.server.fastmcp import FastMCP, Context From d274b9911590e8f6c29e3d812517cabf591a914a Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:19:55 +0100 Subject: [PATCH 14/26] Update src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../mcp_servers/codeql_python/codeql_sqlite_models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py index 07bee1c..51d1224 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/codeql_sqlite_models.py @@ -21,6 +21,5 @@ class Source(Base): def __repr__(self): return (f"") From a3261aaf92b5487988bd20705020b729f5e497ef Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Mon, 1 Dec 2025 22:20:08 +0100 Subject: [PATCH 15/26] Update src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index be01b2e..432cb5c 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -54,7 +54,7 @@ taskflow: - seclab_taskflows.personalities.auditor user_prompt: | Fetch the zipball of the repository {{ GLOBALS_repo }} and use it to analyze the source. - The source is a {{ RESULT_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }} on line {{ RESULT_line }}. + The source is a {{ RESULT_source_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }} on line {{ RESULT_line }}. If the source is in a folder relating to tests or demo code, skip the analysis and update the source entry in the codeql_sqlite database indicating it is not relevant. Analyze what the source endpoint is used for. If it is a web endpoint, identify the routing path that reaches this source, HTTP method, From 2b50b82b38f66c9c285832e2292afdad0c303d16 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 10:10:39 +0000 Subject: [PATCH 16/26] Automatically install Python QL pack --- .../mcp_servers/codeql_python/mcp_server.py | 7 +++++++ .../taskflows/audit/remote_sources_local.yaml | 14 +++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 74dfb9a..5d4d172 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -20,6 +20,8 @@ import json from sqlalchemy import create_engine from sqlalchemy.orm import Session +import subprocess +import importlib.resources from .codeql_sqlite_models import Base, Source from ..utils import process_repo @@ -211,4 +213,9 @@ def clear_codeql_repo(owner: str, repo: str): return f"Cleared {deleted_sources} sources from repo {repo}." if __name__ == "__main__": + # Check if codeql/python-all pack is installed, if not install it + if not os.path.isdir('/.codeql/packages/codeql/python-all'): + pack_path = importlib.resources.files('seclab_taskflows.mcp_servers.codeql_python.queries').joinpath('mcp-python') + print(f"Installing CodeQL pack from {pack_path}") + subprocess.run(["codeql", "pack", "install", pack_path]) mcp.run(show_banner=False, transport="http", host="127.0.0.1", port=9998) diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index 3a78caa..25658a2 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -33,6 +33,16 @@ taskflow: toolboxes: - seclab_taskflows.toolboxes.gh_code_scanning - seclab_taskflows.toolboxes.codeql_python + - task: + must_complete: true + exclude_from_context: true + model: general_tasks + agents: + - seclab_taskflow_agent.personalities.assistant + user_prompt: | + Fetch the zipball of the repository {{ GLOBALS_repo }}. + toolboxes: + - seclab_taskflows.toolboxes.local_gh_resources - task: must_complete: true exclude_from_context: true @@ -53,7 +63,6 @@ taskflow: agents: - seclab_taskflows.personalities.auditor user_prompt: | - Fetch the zipball of the repository {{ GLOBALS_repo }} and use it to analyze the source. The source is a {{ RESULT_type }} in {{ RESULT_repo }} in the location {{ RESULT_source_location }} on line {{ RESULT_line }}. If the source is in a folder relating to tests or demo code, skip the analysis and update the source entry in the codeql_sqlite database indicating it is not relevant. Analyze what the source endpoint is used for. @@ -75,12 +84,11 @@ taskflow: the error you encountered. toolboxes: - seclab_taskflows.toolboxes.codeql_python - - seclab_taskflows.toolboxes.local_gh_resources - seclab_taskflows.toolboxes.local_file_viewer - task: must_complete: true agents: - - seclab_taskflows.personalities.web_application_security_expert + - seclab_taskflows.personalities.auditor model: code_analysis user_prompt: | Fetch the sources of the repo {{ GLOBALS_repo }} and give a summary of the notes. From 29eb2219b3b1153e3786eb78266b003d9ee44483 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:52:54 +0000 Subject: [PATCH 17/26] Use generated data dirs --- .gitignore | 2 +- .../mcp_servers/codeql_python/mcp_server.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index bd801df..3d413d4 100644 --- a/.gitignore +++ b/.gitignore @@ -134,7 +134,7 @@ celerybeat.pid *.sage.py # Environments -*.env +.env .venv env/ venv/ diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 7bf0422..5c967a3 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -10,6 +10,7 @@ filemode='a' ) from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, _debug_log +from seclab_taskflow_agent.path_utils import mcp_data_dir from pydantic import Field #from mcp.server.fastmcp import FastMCP, Context @@ -26,10 +27,10 @@ from .codeql_sqlite_models import Base, Source from ..utils import process_repo -MEMORY = Path(os.getenv('CODEQL_SQLITE_DIR', default='/app/my_data')) -mcp = FastMCP("CodeQL-Python") +MEMORY = mcp_data_dir('seclab-taskflow-agent', 'codeql', 'DATA_DIR') +CODEQL_DBS_BASE_PATH = mcp_data_dir('seclab-taskflow-agent', 'codeql', 'CODEQL_DBS_BASE_PATH') -CODEQL_DBS_BASE_PATH = Path(os.getenv('CODEQL_DBS_BASE_PATH', default='/workspaces/seclab-taskflow-agent/my_data')) +mcp = FastMCP("CodeQL-Python") # tool name -> templated query lookup for supported languages TEMPLATED_QUERY_PATHS = { From eb5a0ffc37a325a2e646b114ff026c31d48a74b3 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:54:21 +0000 Subject: [PATCH 18/26] Use DATA_DIR path for storing sqlite db --- src/seclab_taskflows/toolboxes/codeql_python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/toolboxes/codeql_python.yaml b/src/seclab_taskflows/toolboxes/codeql_python.yaml index a838763..b2525c8 100644 --- a/src/seclab_taskflows/toolboxes/codeql_python.yaml +++ b/src/seclab_taskflows/toolboxes/codeql_python.yaml @@ -17,7 +17,7 @@ server_params: GH_NO_UPDATE_NOTIFIER: "Disable" GH_NO_EXTENSION_UPDATE_NOTIFIER: "Disable" CODEQL_CLI: "{{ env CODEQL_CLI }}" - CODEQL_SQLITE_DIR: "{{ env DATA_DIR }}" + DATA_DIR: "{{ env DATA_DIR }}" server_prompt: | ## CodeQL Supported Programming Languages From 938506382c7d9138ea8566a07a20566a8fc5da20 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 11:56:12 +0000 Subject: [PATCH 19/26] Add Fields to MCP tool params --- .../mcp_servers/codeql_python/mcp_server.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 5c967a3..868af1e 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -153,7 +153,8 @@ def _run_query(query_name: str, database_path: str, language: str, template_valu backend = CodeqlSqliteBackend(MEMORY) @mcp.tool() -def remote_sources(owner: str, repo: str, +def remote_sources(owner: str = Field(description="The owner of the GitHub repository", default=""), + repo: str = Field(description="The name of the GitHub repository", default=""), database_path: str = Field(description="The CodeQL database path."), language: str = Field(description="The language used for the CodeQL database.")): """List all remote sources and their locations in a CodeQL database, then store the results in a database.""" @@ -183,7 +184,7 @@ def remote_sources(owner: str, repo: str, return f"Stored {stored_count} remote sources in {repo}." @mcp.tool() -def fetch_sources(owner: str, repo: str): +def fetch_sources(owner: str = Field(description="The owner of the GitHub repository", default=""), repo: str = Field(description="The name of the GitHub repository", default="")): """ Fetch all sources from the repo """ @@ -191,8 +192,8 @@ def fetch_sources(owner: str, repo: str): return json.dumps(backend.get_sources(repo)) @mcp.tool() -def add_source_notes(owner: str, repo: str, - # database_path: str = Field(description="The CodeQL database path."), +def add_source_notes(owner: str = Field(description="The owner of the GitHub repository", default=""), + repo: str = Field(description="The name of the GitHub repository", default=""), source_location: str = Field(description="The path to the file"), line: int = Field(description="The line number of the source"), notes: str = Field(description="The notes to append to this source", default="")): @@ -203,7 +204,7 @@ def add_source_notes(owner: str, repo: str, return backend.store_new_source(repo = repo, source_location = source_location, line = line, source_type = "", notes = notes, update=True) @mcp.tool() -def clear_codeql_repo(owner: str, repo: str): +def clear_codeql_repo(owner: str = Field(description="The owner of the GitHub repository", default=""), repo: str = Field(description="The name of the GitHub repository", default="")): """ Clear all data for a given repo from the database """ From 3ad757d4eaae8ee4f0835adf4422a5bc0f5d57e9 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 12:34:11 +0000 Subject: [PATCH 20/26] Fix env paths handling --- .../mcp_servers/codeql_python/mcp_server.py | 24 +++++++++++-------- .../taskflows/audit/remote_sources_local.yaml | 1 - 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 868af1e..f152a08 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -10,7 +10,7 @@ filemode='a' ) from seclab_taskflow_agent.mcp_servers.codeql.client import run_query, _debug_log -from seclab_taskflow_agent.path_utils import mcp_data_dir +# from seclab_taskflow_agent.path_utils import mcp_data_dir from pydantic import Field #from mcp.server.fastmcp import FastMCP, Context @@ -27,8 +27,10 @@ from .codeql_sqlite_models import Base, Source from ..utils import process_repo -MEMORY = mcp_data_dir('seclab-taskflow-agent', 'codeql', 'DATA_DIR') -CODEQL_DBS_BASE_PATH = mcp_data_dir('seclab-taskflow-agent', 'codeql', 'CODEQL_DBS_BASE_PATH') +MEMORY = Path(os.getenv('DATA_DIR', default='/app/data')) +CODEQL_DBS_BASE_PATH = Path(os.getenv('CODEQL_DBS_BASE_PATH', default='/app/data')) +# MEMORY = mcp_data_dir('seclab-taskflows', 'codeql', 'DATA_DIR') +# CODEQL_DBS_BASE_PATH = mcp_data_dir('seclab-taskflows', 'codeql', 'CODEQL_DBS_BASE_PATH') mcp = FastMCP("CodeQL-Python") @@ -153,8 +155,8 @@ def _run_query(query_name: str, database_path: str, language: str, template_valu backend = CodeqlSqliteBackend(MEMORY) @mcp.tool() -def remote_sources(owner: str = Field(description="The owner of the GitHub repository", default=""), - repo: str = Field(description="The name of the GitHub repository", default=""), +def remote_sources(owner: str = Field(description="The owner of the GitHub repository"), + repo: str = Field(description="The name of the GitHub repository"), database_path: str = Field(description="The CodeQL database path."), language: str = Field(description="The language used for the CodeQL database.")): """List all remote sources and their locations in a CodeQL database, then store the results in a database.""" @@ -184,7 +186,8 @@ def remote_sources(owner: str = Field(description="The owner of the GitHub repos return f"Stored {stored_count} remote sources in {repo}." @mcp.tool() -def fetch_sources(owner: str = Field(description="The owner of the GitHub repository", default=""), repo: str = Field(description="The name of the GitHub repository", default="")): +def fetch_sources(owner: str = Field(description="The owner of the GitHub repository"), + repo: str = Field(description="The name of the GitHub repository")): """ Fetch all sources from the repo """ @@ -192,11 +195,11 @@ def fetch_sources(owner: str = Field(description="The owner of the GitHub reposi return json.dumps(backend.get_sources(repo)) @mcp.tool() -def add_source_notes(owner: str = Field(description="The owner of the GitHub repository", default=""), - repo: str = Field(description="The name of the GitHub repository", default=""), +def add_source_notes(owner: str = Field(description="The owner of the GitHub repository"), + repo: str = Field(description="The name of the GitHub repository"), source_location: str = Field(description="The path to the file"), line: int = Field(description="The line number of the source"), - notes: str = Field(description="The notes to append to this source", default="")): + notes: str = Field(description="The notes to append to this source")): """ Add new notes to an existing source. The notes will be appended to any existing notes. """ @@ -204,7 +207,8 @@ def add_source_notes(owner: str = Field(description="The owner of the GitHub rep return backend.store_new_source(repo = repo, source_location = source_location, line = line, source_type = "", notes = notes, update=True) @mcp.tool() -def clear_codeql_repo(owner: str = Field(description="The owner of the GitHub repository", default=""), repo: str = Field(description="The name of the GitHub repository", default="")): +def clear_codeql_repo(owner: str = Field(description="The owner of the GitHub repository"), + repo: str = Field(description="The name of the GitHub repository")): """ Clear all data for a given repo from the database """ diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index 667d3eb..1a8f129 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -94,4 +94,3 @@ taskflow: Fetch the sources of the repo {{ GLOBALS_repo }} and give a summary of the notes. toolboxes: - seclab_taskflows.toolboxes.codeql_python - - seclab_taskflow_agent.toolboxes.memcache From 22ba2d270224782f995f48ab6847e983c9fb6ebc Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 12:54:30 +0000 Subject: [PATCH 21/26] Add license --- src/seclab_taskflows/mcp_servers/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/seclab_taskflows/mcp_servers/utils.py b/src/seclab_taskflows/mcp_servers/utils.py index adec991..ca293e2 100644 --- a/src/seclab_taskflows/mcp_servers/utils.py +++ b/src/seclab_taskflows/mcp_servers/utils.py @@ -1,2 +1,5 @@ +# SPDX-FileCopyrightText: 2025 GitHub +# SPDX-License-Identifier: MIT + def process_repo(owner, repo): return f"{owner}/{repo}".lower() From 621deb8d8dbfcbfde55e4c3d265be9fb4a1f245b Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 13:55:52 +0100 Subject: [PATCH 22/26] Add process_repo docstring Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/seclab_taskflows/mcp_servers/utils.py b/src/seclab_taskflows/mcp_servers/utils.py index ca293e2..528f9c4 100644 --- a/src/seclab_taskflows/mcp_servers/utils.py +++ b/src/seclab_taskflows/mcp_servers/utils.py @@ -2,4 +2,14 @@ # SPDX-License-Identifier: MIT def process_repo(owner, repo): + """ + Normalize repository identifier to lowercase format 'owner/repo'. + + Args: + owner (str): The owner of the repository. + repo (str): The name of the repository. + + Returns: + str: The normalized repository identifier in lowercase. + """ return f"{owner}/{repo}".lower() From d72359be75d7d6d9cffa734df68580bb8c21cd62 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 14:14:22 +0100 Subject: [PATCH 23/26] Update src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index f152a08..6981ffc 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -70,7 +70,7 @@ def _resolve_db_path(relative_db_path: str | Path): relative_db_path = str(relative_db_path).strip().lstrip('/') relative_db_path = Path(relative_db_path) absolute_path = (CODEQL_DBS_BASE_PATH / relative_db_path).resolve() - if not str(absolute_path).startswith(str(CODEQL_DBS_BASE_PATH.resolve())): + if not absolute_path.is_relative_to(CODEQL_DBS_BASE_PATH.resolve()): raise RuntimeError(f"Error: Database path {absolute_path} is outside the base path {CODEQL_DBS_BASE_PATH}") if not absolute_path.is_dir(): _debug_log(f"Database path not found: {absolute_path}") From b30cbab46088d53fba0cad44619f6812ea35179c Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 14:14:43 +0100 Subject: [PATCH 24/26] Update src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml index 1a8f129..cef6bdc 100644 --- a/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml +++ b/src/seclab_taskflows/taskflows/audit/remote_sources_local.yaml @@ -50,7 +50,7 @@ taskflow: agents: - seclab_taskflow_agent.personalities.assistant user_prompt: | - Fetch the sources from the repo {{ GLOBALS_repo }}. + Fetch the sources from the repo {{ GLOBALS_repo }}. toolboxes: - seclab_taskflows.toolboxes.codeql_python - task: From 7b37b413f476ce0e7d7a9657b98e62ab515fa983 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 14:14:59 +0100 Subject: [PATCH 25/26] Update src/seclab_taskflows/mcp_servers/codeql_python/README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/seclab_taskflows/mcp_servers/codeql_python/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/README.md b/src/seclab_taskflows/mcp_servers/codeql_python/README.md index 7f5a19d..d8c0f4a 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/README.md +++ b/src/seclab_taskflows/mcp_servers/codeql_python/README.md @@ -18,7 +18,7 @@ It will most likely look similar to this: - create or update your `.env` file in the root of this project with values for: ``` COPILOT_TOKEN= # a fine-grained GitHub personal access token with permission for "copilot chat" -CODEQL_DBS_BASE_PATH="/workspaces/seclab-taskflows/data/codeql_databases" #path to folder with your CodeQL databases +CODEQL_DBS_BASE_PATH="/workspaces/seclab-taskflows/data/codeql_databases" # path to folder with your CodeQL databases # Example values for a local setup, run with `python -m seclab_taskflow_agent -t seclab_taskflows.taskflows.audit.remote_sources_local` MEMCACHE_STATE_DIR="/workspaces/seclab-taskflows/data" # path to folder for storing the memcache database From 44941a55f0356c5ae1438bf59ff8f5eb0ae54566 Mon Sep 17 00:00:00 2001 From: Sylwia Budzynska <102833689+sylwia-budzynska@users.noreply.github.com> Date: Wed, 3 Dec 2025 17:48:34 +0000 Subject: [PATCH 26/26] Improve error handling --- src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py index 6981ffc..2ee817a 100644 --- a/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py +++ b/src/seclab_taskflows/mcp_servers/codeql_python/mcp_server.py @@ -126,7 +126,7 @@ def _csv_parse(raw): this_obj[k.strip()] = row[j + 2] results.append(this_obj) except (csv.Error, IndexError, ValueError) as e: - return ["Error: CSV parsing error: " + str(e)] + return f"Error: CSV parsing error: {e}" return results @@ -167,8 +167,6 @@ def remote_sources(owner: str = Field(description="The owner of the GitHub repos # Check if results is an error (list of strings) or valid data (list of dicts) if isinstance(results, str): return f"Error: {results}" - if results and isinstance(results[0], str): - return f"Error: {results[0]}" # Store each result as a source stored_count = 0