From 7e03cfc4484976042d719e373fca737505f00b37 Mon Sep 17 00:00:00 2001 From: Saurabh Sinha Date: Fri, 15 May 2026 12:40:38 -0400 Subject: [PATCH 1/4] Implementation of taint analysis with CodeQL, along with tests and fixtures. Signed-off-by: Saurabh Sinha --- codeanalyzer/__main__.py | 22 + codeanalyzer/config/taint_config_defaults.py | 189 ++++ codeanalyzer/config/taint_config_loader.py | 317 +++++++ codeanalyzer/core.py | 93 +- codeanalyzer/options/options.py | 2 + codeanalyzer/schema/py_schema.py | 377 +++++++- .../codeql/codeql_analysis.py | 178 +++- .../codeql/codeql_query_runner.py | 1 + .../codeql/taint_query_generator.py | 428 +++++++++ test/conftest.py | 155 ++++ .../command_injection_app/vulnerable.py | 172 ++++ .../taint_analysis/flask_app/vulnerable.py | 231 +++++ .../path_traversal_app/vulnerable.py | 189 ++++ .../taint_analysis/sanitizer_app/mixed.py | 114 +++ .../taint_analysis/sanitizer_app/safe.py | 201 +++++ .../sql_injection_app/vulnerable.py | 159 ++++ .../taint_analysis/xss_app/vulnerable.py | 217 +++++ test/test_cli.py | 128 ++- test/test_taint_analysis.py | 841 ++++++++++++++++++ 19 files changed, 4004 insertions(+), 10 deletions(-) create mode 100644 codeanalyzer/config/taint_config_defaults.py create mode 100644 codeanalyzer/config/taint_config_loader.py create mode 100644 codeanalyzer/semantic_analysis/codeql/taint_query_generator.py create mode 100644 test/fixtures/taint_analysis/command_injection_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/flask_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/path_traversal_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/sanitizer_app/mixed.py create mode 100644 test/fixtures/taint_analysis/sanitizer_app/safe.py create mode 100644 test/fixtures/taint_analysis/sql_injection_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/xss_app/vulnerable.py create mode 100644 test/test_taint_analysis.py diff --git a/codeanalyzer/__main__.py 
b/codeanalyzer/__main__.py index 19e7f2a..02b25ae 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -27,9 +27,20 @@ def main( case_sensitive=False, ), ] = OutputFormat.JSON, + analysis_level: Annotated[ + int, + typer.Option("-a", "--analysis-level", help="1: symbol table, 2: call graph (requires --codeql), 3: taint analysis (requires --codeql)."), + ] = 1, using_codeql: Annotated[ bool, typer.Option("--codeql/--no-codeql", help="Enable CodeQL-based analysis.") ] = False, + taint_config: Annotated[ + Optional[Path], + typer.Option( + "--taint-config", + help="Path to taint analysis configuration file (YAML or JSON). Used with --analysis-level 3.", + ), + ] = None, using_ray: Annotated[ bool, typer.Option("--ray/--no-ray", help="Enable Ray for distributed analysis."), @@ -74,10 +85,20 @@ def main( int, typer.Option("-v", count=True, help="Increase verbosity: -v, -vv, -vvv") ] = 0, ): + # Validate analysis level requirements + if analysis_level >= 2 and not using_codeql: + logger.error("Analysis levels 2 and 3 require --codeql flag") + raise typer.Exit(code=1) + + if analysis_level >= 3 and taint_config and not taint_config.exists(): + logger.error(f"Taint configuration file '{taint_config}' does not exist.") + raise typer.Exit(code=1) + options = AnalysisOptions( input=input, output=output, format=format, + analysis_level=analysis_level, using_codeql=using_codeql, using_ray=using_ray, rebuild_analysis=rebuild_analysis, @@ -86,6 +107,7 @@ def main( cache_dir=cache_dir, clear_cache=clear_cache, verbosity=verbosity, + taint_config=taint_config, ) _set_log_level(options.verbosity) diff --git a/codeanalyzer/config/taint_config_defaults.py b/codeanalyzer/config/taint_config_defaults.py new file mode 100644 index 0000000..c8cf599 --- /dev/null +++ b/codeanalyzer/config/taint_config_defaults.py @@ -0,0 +1,189 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the 
Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Default taint analysis configuration. + +Design +------ +The generated CodeQL query uses CodeQL's built-in security models as the +primary detection layer (``RemoteFlowSource``, ``SqlInjection::Sink``, +``CommandInjection::Sink``, ``CodeInjection::Sink``, ``PathTraversal::Sink``, +``XSS::Sink``). These cover hundreds of APIs automatically. + +The patterns defined here are **supplementary** — they extend built-in +coverage with sources/sinks that are not modelled by CodeQL out of the box: + +Sources not in RemoteFlowSource: + - ``sys.argv`` — command-line arguments + - ``input()`` — interactive user input + - ``os.getenv()`` — environment variables + - ``os.environ.get()`` — environment variables + - ``requests.*`` — outbound HTTP responses used as data sources + +Sinks not in built-in models (project-specific or less common): + - ``ldap.search()`` — LDAP injection + +Sanitizers: + - Common HTML/path/command sanitizers that CodeQL may not model as barriers. + +Users can extend or override this configuration via a YAML/JSON file passed +with ``--taint-config``. All CodeQL patterns must use double-quoted strings. 
+""" + +from codeanalyzer.schema.py_schema import ( + TaintAnalysisConfig, + TaintSourceConfig, + TaintSinkConfig, + TaintSanitizerConfig, +) + + +def get_default_taint_config() -> TaintAnalysisConfig: + """Returns the default taint analysis configuration. + + Combines CodeQL's built-in security models (primary) with supplementary + user-configured patterns for sources/sinks not covered by the built-ins. + + Returns: + TaintAnalysisConfig: Default configuration + """ + + return TaintAnalysisConfig( + sources=[ + # --- Sources not covered by CodeQL's RemoteFlowSource --- + + # Command-line arguments + TaintSourceConfig( + name="command_line_args", + description="Command-line arguments via sys.argv", + pattern='API::moduleImport("sys").getMember("argv")', + source_type="command_line_argument", + ), + + # Interactive user input + TaintSourceConfig( + name="user_input", + description="Direct user input via input() function", + pattern='API::builtin("input").getACall()', + source_type="user_input", + ), + + # Environment variables + TaintSourceConfig( + name="env_getenv", + description="Environment variables via os.getenv", + pattern='API::moduleImport("os").getMember("getenv").getACall()', + source_type="environment_variable", + ), + TaintSourceConfig( + name="env_environ_get", + description="Environment variables via os.environ.get", + pattern='API::moduleImport("os").getMember("environ").getMember("get").getACall()', + source_type="environment_variable", + ), + + # Outbound HTTP responses used as data sources (requests library) + TaintSourceConfig( + name="requests_get_response", + description="HTTP GET response body (requests.get().text / .json())", + pattern='API::moduleImport("requests").getMember("get").getReturn().getMember("text")', + source_type="http_response", + ), + TaintSourceConfig( + name="requests_post_response", + description="HTTP POST response body (requests.post().text / .json())", + 
pattern='API::moduleImport("requests").getMember("post").getReturn().getMember("text")', + source_type="http_response", + ), + ], + + sinks=[ + # --- Sinks not covered by CodeQL's built-in sink classes --- + + # LDAP Injection (not in CodeQL's standard Python models) + TaintSinkConfig( + name="ldap_search", + description="LDAP search operations", + pattern='API::moduleImport("ldap").getMember("search").getACall()', + sink_type="ldap_query", + vulnerability_type="LDAP Injection", + severity="high", + argument_index=0, + ), + TaintSinkConfig( + name="ldap3_connection_search", + description="ldap3 Connection.search", + pattern='API::moduleImport("ldap3").getMember("Connection").getReturn().getMember("search").getACall()', + sink_type="ldap_query", + vulnerability_type="LDAP Injection", + severity="high", + argument_index=1, + ), + ], + + sanitizers=[ + # HTML / XSS sanitizers + TaintSanitizerConfig( + name="html_escape", + description="HTML escape function (html.escape)", + pattern='API::moduleImport("html").getMember("escape").getACall()', + sanitizes=["xss", "template_injection"], + ), + TaintSanitizerConfig( + name="markupsafe_escape", + description="MarkupSafe Markup() / escape()", + pattern='API::moduleImport("markupsafe").getMember("escape").getACall()', + sanitizes=["xss"], + ), + + # Command injection sanitizers + TaintSanitizerConfig( + name="shlex_quote", + description="Shell argument quoting via shlex.quote", + pattern='API::moduleImport("shlex").getMember("quote").getACall()', + sanitizes=["command_injection"], + ), + + # Path traversal sanitizers + TaintSanitizerConfig( + name="os_path_normpath", + description="Path normalization via os.path.normpath", + pattern='API::moduleImport("os").getMember("path").getMember("normpath").getACall()', + sanitizes=["path_traversal"], + ), + TaintSanitizerConfig( + name="os_path_abspath", + description="Absolute path resolution via os.path.abspath", + 
pattern='API::moduleImport("os").getMember("path").getMember("abspath").getACall()', + sanitizes=["path_traversal"], + ), + TaintSanitizerConfig( + name="pathlib_resolve", + description="Path resolution via pathlib.Path.resolve()", + pattern='API::moduleImport("pathlib").getMember("Path").getReturn().getMember("resolve").getACall()', + sanitizes=["path_traversal"], + ), + ], + + # Analysis options + max_path_length=10, + include_implicit_flows=False, + confidence_threshold="medium", + exclude_files=[], + exclude_functions=[], + include_safe_flows=False, + group_by_vulnerability=True, + ) diff --git a/codeanalyzer/config/taint_config_loader.py b/codeanalyzer/config/taint_config_loader.py new file mode 100644 index 0000000..120f1a1 --- /dev/null +++ b/codeanalyzer/config/taint_config_loader.py @@ -0,0 +1,317 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Taint analysis configuration loader. + +This module provides functionality to load taint analysis configurations from +YAML or JSON files and merge them with default configurations. 
+""" + +import json +from pathlib import Path +from typing import Optional, Union + +import yaml + +from codeanalyzer.schema.py_schema import TaintAnalysisConfig +from codeanalyzer.config.taint_config_defaults import get_default_taint_config +from codeanalyzer.utils import logger + + +class TaintConfigLoader: + """Loads and merges taint analysis configurations.""" + + @staticmethod + def load_config( + config_path: Optional[Union[str, Path]] = None, + use_defaults: bool = True + ) -> TaintAnalysisConfig: + """Load taint analysis configuration. + + Args: + config_path: Path to custom configuration file (YAML or JSON). + If None, only defaults are used. + use_defaults: Whether to include default sources/sinks/sanitizers. + If True, custom config extends defaults. + If False, only custom config is used. + + Returns: + TaintAnalysisConfig: Merged configuration + + Raises: + FileNotFoundError: If config_path is provided but file doesn't exist + ValueError: If file format is unsupported or invalid + """ + # Start with defaults if requested + if use_defaults: + config = get_default_taint_config() + logger.debug(f"Loaded default taint configuration with {len(config.sources)} sources, " + f"{len(config.sinks)} sinks, {len(config.sanitizers)} sanitizers") + else: + config = TaintAnalysisConfig() + logger.debug("Starting with empty taint configuration") + + # Load and merge custom configuration + if config_path: + custom_config = TaintConfigLoader._load_from_file(config_path) + config = TaintConfigLoader._merge_configs(config, custom_config) + logger.info(f"Merged custom configuration from {config_path}") + + # Filter out disabled items + config = TaintConfigLoader._filter_disabled(config) + + logger.info(f"Final taint configuration: {len(config.sources)} sources, " + f"{len(config.sinks)} sinks, {len(config.sanitizers)} sanitizers") + + return config + + @staticmethod + def _load_from_file(config_path: Union[str, Path]) -> TaintAnalysisConfig: + """Load configuration from 
YAML or JSON file. + + Args: + config_path: Path to configuration file + + Returns: + TaintAnalysisConfig: Loaded configuration + + Raises: + FileNotFoundError: If file doesn't exist + ValueError: If file format is unsupported or invalid + """ + path = Path(config_path) + + if not path.exists(): + raise FileNotFoundError(f"Configuration file not found: {config_path}") + + logger.debug(f"Loading taint configuration from {path}") + content = path.read_text() + + # Parse based on file extension + try: + if path.suffix in ['.yaml', '.yml']: + data = yaml.safe_load(content) + elif path.suffix == '.json': + data = json.loads(content) + else: + raise ValueError( + f"Unsupported configuration format: {path.suffix}. " + f"Supported formats: .yaml, .yml, .json" + ) + except yaml.YAMLError as e: + raise ValueError(f"Invalid YAML in configuration file: {e}") + except json.JSONDecodeError as e: + raise ValueError(f"Invalid JSON in configuration file: {e}") + + # Convert to Pydantic model + try: + return TaintAnalysisConfig.model_validate(data) + except Exception as e: + raise ValueError(f"Invalid taint configuration structure: {e}") + + @staticmethod + def _merge_configs( + base: TaintAnalysisConfig, + custom: TaintAnalysisConfig + ) -> TaintAnalysisConfig: + """Merge custom configuration into base configuration. + + Custom sources/sinks/sanitizers are added to the base. + If a custom item has the same name as a base item, it overrides it. 
+ + Args: + base: Base configuration (typically defaults) + custom: Custom configuration to merge in + + Returns: + TaintAnalysisConfig: Merged configuration + """ + # Create name-based lookups for base config + base_sources = {s.name: s for s in base.sources} + base_sinks = {s.name: s for s in base.sinks} + base_sanitizers = {s.name: s for s in base.sanitizers} + + # Track what was overridden + overridden_sources = [] + overridden_sinks = [] + overridden_sanitizers = [] + + # Merge sources + for source in custom.sources: + if source.name in base_sources: + overridden_sources.append(source.name) + base_sources[source.name] = source + + # Merge sinks + for sink in custom.sinks: + if sink.name in base_sinks: + overridden_sinks.append(sink.name) + base_sinks[sink.name] = sink + + # Merge sanitizers + for sanitizer in custom.sanitizers: + if sanitizer.name in base_sanitizers: + overridden_sanitizers.append(sanitizer.name) + base_sanitizers[sanitizer.name] = sanitizer + + # Log merge information + if overridden_sources: + logger.debug(f"Overridden sources: {', '.join(overridden_sources)}") + if overridden_sinks: + logger.debug(f"Overridden sinks: {', '.join(overridden_sinks)}") + if overridden_sanitizers: + logger.debug(f"Overridden sanitizers: {', '.join(overridden_sanitizers)}") + + # Merge exclude lists (combine both) + merged_exclude_files = list(set(base.exclude_files + custom.exclude_files)) + merged_exclude_functions = list(set(base.exclude_functions + custom.exclude_functions)) + + # Create merged config + # Use custom values for options if they differ from defaults + return TaintAnalysisConfig( + sources=list(base_sources.values()), + sinks=list(base_sinks.values()), + sanitizers=list(base_sanitizers.values()), + max_path_length=custom.max_path_length if custom.max_path_length != 10 else base.max_path_length, + include_implicit_flows=custom.include_implicit_flows or base.include_implicit_flows, + confidence_threshold=custom.confidence_threshold if 
custom.confidence_threshold != "medium" else base.confidence_threshold, + exclude_files=merged_exclude_files, + exclude_functions=merged_exclude_functions, + include_safe_flows=custom.include_safe_flows or base.include_safe_flows, + group_by_vulnerability=custom.group_by_vulnerability if not custom.group_by_vulnerability else base.group_by_vulnerability, + ) + + @staticmethod + def _filter_disabled(config: TaintAnalysisConfig) -> TaintAnalysisConfig: + """Filter out disabled sources, sinks, and sanitizers. + + Args: + config: Configuration to filter + + Returns: + TaintAnalysisConfig: Filtered configuration with only enabled items + """ + enabled_sources = [s for s in config.sources if s.enabled] + enabled_sinks = [s for s in config.sinks if s.enabled] + enabled_sanitizers = [s for s in config.sanitizers if s.enabled] + + disabled_count = ( + len(config.sources) - len(enabled_sources) + + len(config.sinks) - len(enabled_sinks) + + len(config.sanitizers) - len(enabled_sanitizers) + ) + + if disabled_count > 0: + logger.debug(f"Filtered out {disabled_count} disabled items") + + return TaintAnalysisConfig( + sources=enabled_sources, + sinks=enabled_sinks, + sanitizers=enabled_sanitizers, + max_path_length=config.max_path_length, + include_implicit_flows=config.include_implicit_flows, + confidence_threshold=config.confidence_threshold, + exclude_files=config.exclude_files, + exclude_functions=config.exclude_functions, + include_safe_flows=config.include_safe_flows, + group_by_vulnerability=config.group_by_vulnerability, + ) + + @staticmethod + def save_config( + config: TaintAnalysisConfig, + output_path: Union[str, Path], + format: str = "yaml" + ) -> None: + """Save configuration to file. 
+ + Args: + config: Configuration to save + output_path: Path where to save the configuration + format: Output format ('yaml' or 'json') + + Raises: + ValueError: If format is unsupported + """ + path = Path(output_path) + + # Ensure parent directory exists + path.parent.mkdir(parents=True, exist_ok=True) + + if format.lower() in ['yaml', 'yml']: + content = yaml.dump( + config.model_dump(), + default_flow_style=False, + sort_keys=False, + indent=2 + ) + elif format.lower() == 'json': + content = config.model_dump_json(indent=2) + else: + raise ValueError(f"Unsupported format: {format}. Use 'yaml' or 'json'") + + path.write_text(content) + logger.info(f"Saved taint configuration to {path}") + + @staticmethod + def validate_config(config: TaintAnalysisConfig) -> list[str]: + """Validate configuration and return list of warnings/errors. + + Args: + config: Configuration to validate + + Returns: + list[str]: List of validation issues (empty if valid) + """ + issues = [] + + # Check for duplicate names + source_names = [s.name for s in config.sources] + if len(source_names) != len(set(source_names)): + duplicates = [name for name in source_names if source_names.count(name) > 1] + issues.append(f"Duplicate source names found: {', '.join(set(duplicates))}") + + sink_names = [s.name for s in config.sinks] + if len(sink_names) != len(set(sink_names)): + duplicates = [name for name in sink_names if sink_names.count(name) > 1] + issues.append(f"Duplicate sink names found: {', '.join(set(duplicates))}") + + sanitizer_names = [s.name for s in config.sanitizers] + if len(sanitizer_names) != len(set(sanitizer_names)): + duplicates = [name for name in sanitizer_names if sanitizer_names.count(name) > 1] + issues.append(f"Duplicate sanitizer names found: {', '.join(set(duplicates))}") + + # Validate patterns are not empty + for source in config.sources: + if not source.pattern.strip(): + issues.append(f"Empty pattern for source: {source.name}") + + for sink in config.sinks: + if 
not sink.pattern.strip(): + issues.append(f"Empty pattern for sink: {sink.name}") + + for sanitizer in config.sanitizers: + if not sanitizer.pattern.strip(): + issues.append(f"Empty pattern for sanitizer: {sanitizer.name}") + + # Check if there are any sources and sinks + if not config.sources: + issues.append("No taint sources configured") + + if not config.sinks: + issues.append("No taint sinks configured") + + return issues diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index b8cfcca..51ccecd 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -70,6 +70,7 @@ def __init__(self, options: AnalysisOptions) -> None: self.virtualenv: Optional[Path] = None self.using_ray: bool = options.using_ray self.file_name: Optional[Path] = options.file_name + self.analysis_depth: int = options.analysis_level @staticmethod def _cmd_exec_helper( @@ -361,11 +362,21 @@ def __exit__(self, *args, **kwargs) -> None: def analyze(self) -> PyApplication: """Analyze the project and return a PyApplication with symbol table. + Analysis levels: + - Level 1: Symbol table only + - Level 2: Symbol table + call graph (requires CodeQL) + - Level 3: Symbol table + call graph + taint analysis (requires CodeQL) + Uses caching to avoid re-analyzing unchanged files. """ + # Validate analysis level requirements + if self.analysis_depth >= 2 and not self.using_codeql: + logger.error("Analysis levels 2 and 3 require --codeql flag") + raise ValueError("CodeQL is required for analysis levels 2 and above") + cache_file = self.cache_dir / "analysis_cache.json" - # Try to load existing cached analysis + # Try to load existing cached analysis cached_pyapplication = None if not self.rebuild_analysis and cache_file.exists(): try: @@ -375,7 +386,7 @@ def analyze(self) -> PyApplication: logger.warning(f"Failed to load cache: {e}. 
Rebuilding analysis.") cached_pyapplication = None - # Build symbol table from cached application if available (if no available, the build a new one) + # Level 1: Build symbol table symbol_table = self._build_symbol_table(cached_pyapplication.symbol_table if cached_pyapplication else {}) # Build the call graph in four steps: @@ -399,10 +410,17 @@ def analyze(self) -> PyApplication: # Recreate pyapplication app = PyApplication.builder().symbol_table(symbol_table).call_graph(call_graph).build() - + + # Level 3: Add taint analysis (if CodeQL is enabled) + if self.analysis_depth >= 3 and self.using_codeql: + logger.info("Performing taint analysis (Level 3)...") + taint_results = self._perform_taint_analysis(symbol_table=symbol_table) + app.taint_analysis = taint_results + logger.info(f"✅ Taint analysis complete. Found {len(taint_results.flows)} flows.") + # Save to cache self._save_analysis_cache(app, cache_file) - + return app def _load_pyapplication_from_cache(self, cache_file: Path) -> PyApplication: @@ -717,4 +735,69 @@ def _get_call_graph( return edges except Exception as exc: logger.warning(f"CodeQL call-graph extraction failed: {exc}") - return [] \ No newline at end of file + return [] + + def _perform_taint_analysis(self, symbol_table: Optional[Dict[str, PyModule]] = None): + """Perform taint analysis using CodeQL. + + Args: + symbol_table: Optional symbol table from analysis level 1. When + provided, taint sources and sinks are resolved to the matching + ``PyCallsite`` objects already captured during syntactic analysis. 
+ + Returns: + PyTaintAnalysisResult: Complete taint analysis results + + Raises: + ValueError: If CodeQL database is not available + """ + from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL + from codeanalyzer.config.taint_config_loader import TaintConfigLoader + from codeanalyzer.schema.py_schema import PyTaintAnalysisResult + + if not self.db_path: + raise ValueError("CodeQL database not available for taint analysis") + + # Load taint configuration + if self.options.taint_config: + logger.info(f"Loading taint configuration from {self.options.taint_config}") + taint_config = TaintConfigLoader.load_config( + self.options.taint_config, + use_defaults=True + ) + else: + logger.info("Using default taint analysis configuration") + taint_config = TaintConfigLoader.load_config(use_defaults=True) + + # Log configuration summary + logger.info(f"Taint analysis configuration:") + logger.info(f" - Sources: {len(taint_config.sources)}") + logger.info(f" - Sinks: {len(taint_config.sinks)}") + logger.info(f" - Sanitizers: {len(taint_config.sanitizers)}") + + # Perform analysis + codeql = CodeQL( + project_dir=self.project_dir, + db_path=self.db_path, + codeql_bin=self.codeql_bin, + codeql_packs_dir=self.codeql_packs_dir, + taint_config=taint_config, + ) + + results = codeql.analyze_taint_flows(symbol_table=symbol_table) + + # Log summary + logger.info(f"Taint analysis summary:") + logger.info(f" - Total flows detected: {len(results.flows)}") + + n_critical = sum(1 for f in results.flows if f.severity == "critical") + n_high = sum(1 for f in results.flows if f.severity == "high") + n_medium = sum(1 for f in results.flows if f.severity == "medium") + n_low = sum(1 for f in results.flows if f.severity == "low") + if results.flows: + logger.info(f" - Critical: {n_critical}") + logger.info(f" - High: {n_high}") + logger.info(f" - Medium: {n_medium}") + logger.info(f" - Low: {n_low}") + + return results diff --git a/codeanalyzer/options/options.py 
b/codeanalyzer/options/options.py index 1602d45..e4d32e8 100644 --- a/codeanalyzer/options/options.py +++ b/codeanalyzer/options/options.py @@ -14,6 +14,7 @@ class AnalysisOptions: input: Path output: Optional[Path] = None format: OutputFormat = OutputFormat.JSON + analysis_level: int = 1 using_codeql: bool = False using_ray: bool = False rebuild_analysis: bool = False @@ -22,3 +23,4 @@ class AnalysisOptions: cache_dir: Optional[Path] = None clear_cache: bool = False verbosity: int = 0 + taint_config: Optional[Path] = None diff --git a/codeanalyzer/schema/py_schema.py b/codeanalyzer/schema/py_schema.py index 8bef391..6dd004b 100644 --- a/codeanalyzer/schema/py_schema.py +++ b/codeanalyzer/schema/py_schema.py @@ -339,6 +339,374 @@ class PyModule(BaseModel): file_size: Optional[int] = None +# ============================================================================ +# Taint Analysis Models (Analysis Level 3) +# ============================================================================ + +@builder +@msgpk +class TaintSourceConfig(BaseModel): + """Configuration entry that tells the CodeQL query generator where + untrusted data can enter the application. + + Each entry is turned into a predicate clause inside the generated + ``isConfiguredSource`` CodeQL predicate. + """ + + name: str + """Unique identifier for this source entry (used for logging and deduplication).""" + + description: str + """Human-readable explanation of what this source represents.""" + + pattern: str + """CodeQL API-graph expression that matches the source call site. + + Must be a valid CodeQL expression that evaluates to a ``DataFlow::Node``, + e.g. ``API::builtin("input").getACall()`` or + ``API::moduleImport("flask").getMember("request").getMember("args").asSource()``. + All string literals inside the pattern must use double quotes (CodeQL + does not support single-quoted strings). + """ + + source_type: str + """Logical category label attached to every flow that originates here. 
+ + Examples: ``"user_input"``, ``"web_request"``, ``"environment_variable"``, + ``"file_read"``, ``"http_request"``. The label is propagated to + ``PyTaintSource.source_type`` in the analysis results. + """ + + enabled: bool = True + """When ``False`` this entry is filtered out before query generation.""" + + +@builder +@msgpk +class TaintSinkConfig(BaseModel): + """Configuration entry that tells the CodeQL query generator where + tainted data reaching this call site would be dangerous. + + Each entry is turned into a predicate clause inside the generated + ``isConfiguredSink`` CodeQL predicate. + """ + + name: str + """Unique identifier for this sink entry (used for logging and deduplication).""" + + description: str + """Human-readable explanation of what this sink represents.""" + + pattern: str + """CodeQL API-graph expression that matches the sink call site. + + Must be a valid CodeQL expression that evaluates to a ``DataFlow::Node``, + e.g. ``API::moduleImport("sqlite3").getMember("execute").getACall()``. + All string literals inside the pattern must use double quotes. + """ + + sink_type: str + """Logical category label attached to every flow that terminates here. + + Examples: ``"sql_execution"``, ``"command_execution"``, ``"code_execution"``, + ``"file_access"``, ``"template_rendering"``. The label is propagated to + ``PyTaintSink.sink_type`` in the analysis results. + """ + + vulnerability_type: str + """Human-readable vulnerability class reported in the analysis results. + + Examples: ``"SQL Injection"``, ``"Command Injection"``, ``"Path Traversal"``, + ``"Cross-Site Scripting (XSS)"``, ``"Code Injection"``. + """ + + severity: Literal["critical", "high", "medium", "low"] + """Risk level of a confirmed taint flow reaching this sink. + + Propagated verbatim to ``PyTaintSink.severity`` and ``PyTaintFlow.severity``. 
+ """ + + enabled: bool = True + """When ``False`` this entry is filtered out before query generation.""" + + argument_index: Optional[int] = None + """Zero-based index of the argument that must be tainted for the sink to fire. + + When set, the generated predicate uses + ``pattern.getParameter(argument_index).asSink()`` so that only the + specific argument position is tracked (e.g. index ``0`` for the query + string in ``cursor.execute(query, params)``). When ``None`` the call + itself is used as the sink node. + """ + + +@builder +@msgpk +class TaintSanitizerConfig(BaseModel): + """Configuration entry that tells the CodeQL query generator which + call sites act as sanitizers, blocking taint propagation. + + Each entry is turned into a predicate clause inside the generated + ``isConfiguredSanitizer`` CodeQL predicate. + """ + + name: str + """Unique identifier for this sanitizer entry.""" + + description: str + """Human-readable explanation of what this sanitizer does.""" + + pattern: str + """CodeQL API-graph expression that matches the sanitizing call site. + + Must be a valid CodeQL expression that evaluates to a ``DataFlow::Node``, + e.g. ``API::moduleImport("html").getMember("escape").getACall()``. + All string literals inside the pattern must use double quotes. + """ + + sanitizes: List[str] = [] + """Informational list of vulnerability types this sanitizer mitigates. + + Not used by the CodeQL query generator (all enabled sanitizers block all + flows); present for documentation and future fine-grained filtering. + Examples: ``["xss", "template_injection"]``, ``["command_injection"]``. + """ + + enabled: bool = True + """When ``False`` this entry is filtered out before query generation.""" + + +@builder +@msgpk +class TaintAnalysisConfig(BaseModel): + """Complete, self-contained configuration for a taint analysis run. + + Passed to ``TaintQueryGenerator.generate_query()`` which turns it into a + single executable CodeQL query. 
All three lists are filtered to remove + disabled entries before query generation. + """ + + sources: List[TaintSourceConfig] = [] + """Ordered list of taint source definitions. At least one enabled source + is required for the analysis to produce results.""" + + sinks: List[TaintSinkConfig] = [] + """Ordered list of taint sink definitions. At least one enabled sink is + required for the analysis to produce results.""" + + sanitizers: List[TaintSanitizerConfig] = [] + """Ordered list of sanitizer definitions. May be empty; when non-empty + the generated query will not report flows that pass through a sanitizer.""" + + max_path_length: int = 10 + """Maximum number of intermediate steps in a reported taint path. + Longer paths are still detected but truncated in the output.""" + + include_implicit_flows: bool = False + """Whether to track implicit (control-flow) taint in addition to explicit + (data-flow) taint. Enabling this increases recall but also false positives.""" + + confidence_threshold: Literal["high", "medium", "low"] = "medium" + """Minimum confidence level for a flow to be included in the results. + Currently informational; all flows are reported regardless of this value.""" + + exclude_files: List[str] = [] + """Glob patterns for source files to exclude from analysis (e.g. test files).""" + + exclude_functions: List[str] = [] + """Qualified function names to exclude as sources or sinks.""" + + include_safe_flows: bool = False + """When ``True``, also report flows that pass through a sanitizer. + Useful for auditing sanitizer coverage.""" + + group_by_vulnerability: bool = True + """When ``True``, results are grouped by vulnerability type in log output.""" + + +@builder +@msgpk +class PyTaintSource(BaseModel): + """Represents a taint source - where untrusted data enters the system. + + Sources are always call sites (e.g. ``input()``, ``request.args.get()``, + ``os.getenv()``). 
The ``call_site`` field captures the full call-site + metadata from the symbol table so that downstream tasks can access + receiver type, argument types, callee signature, and precise location + without duplicating that information here. + """ + + source_type: str + """Logical category of the source (e.g. ``"user_input"``, ``"web_request"``).""" + + call_site: PyCallsite + """The call-site in the symbol table where tainted data originates.""" + + description: Optional[str] = None + + +@builder +@msgpk +class PyTaintSink(BaseModel): + """Represents a taint sink - where tainted data could cause harm. + + Sinks are always call sites (e.g. ``cursor.execute()``, ``os.system()``, + ``eval()``). The ``call_site`` field captures the full call-site metadata + from the symbol table so that downstream tasks can access receiver type, + argument types, callee signature, and precise location without duplicating + that information here. + """ + + sink_type: str + """Logical category of the sink (e.g. ``"sql_execution"``, ``"command_execution"``).""" + + call_site: PyCallsite + """The call-site in the symbol table where tainted data is consumed.""" + + severity: Literal["critical", "high", "medium", "low"] = "medium" + description: Optional[str] = None + + +@builder +@msgpk +class PyTaintFlowStep(BaseModel): + """Represents a single intermediate step in a taint flow path. + + A path is the ordered sequence of program points through which tainted + data travels from a source to a sink. Each step records the location + and role of one such program point. + + Note: the current CodeQL query does not populate intermediate path steps + (``path`` is always empty in ``PyTaintFlow``). This model is reserved + for future path-step extraction. 
+ """ + + location: str + """Absolute file path of the source file containing this step.""" + + function_name: str + """Simple name of the enclosing function or method (``""`` at + module level).""" + + start_line: int = -1 + """1-based line number where this step begins; ``-1`` if unknown.""" + + end_line: int = -1 + """1-based line number where this step ends; ``-1`` if unknown.""" + + start_column: int = -1 + """0-based column offset where this step begins; ``-1`` if unknown.""" + + end_column: int = -1 + """0-based column offset where this step ends; ``-1`` if unknown.""" + + expression: Optional[str] = None + """Source-code expression at this step as a string, if available.""" + + step_type: Literal["source", "propagation", "sink"] = "propagation" + """Role of this step in the flow path. + + * ``"source"`` — the first step; tainted data originates here. + * ``"propagation"`` — an intermediate step; tainted data passes through. + * ``"sink"`` — the last step; tainted data reaches a dangerous operation. + """ + + description: Optional[str] = None + """Optional human-readable description of what happens at this step.""" + + +@builder +@msgpk +class PyTaintFlow(BaseModel): + """Represents a complete, confirmed taint flow from a source to a sink. + + A taint flow means that data originating at ``source`` (an untrusted + input call site) can reach ``sink`` (a dangerous operation call site) + without passing through a sanitizer, as determined by CodeQL's + inter-procedural dataflow analysis. + """ + + flow_id: str + """Stable identifier for this flow, derived from source and sink locations. + + Format: ``"<source_file>:<source_start_line>-><sink_file>:<sink_start_line>"``. + Used for deduplication across incremental analysis runs. + """ + + source: PyTaintSource + """The call site where untrusted data enters the application. + + Carries a ``PyCallsite`` that links back to the symbol table entry + (when the symbol table was available during analysis).
+ """ + + sink: PyTaintSink + """The call site where tainted data reaches a dangerous operation. + + Carries a ``PyCallsite`` that links back to the symbol table entry + (when the symbol table was available during analysis). + """ + + path: List[PyTaintFlowStep] = [] + """Ordered list of intermediate steps between source and sink. + + Currently always empty — reserved for future path-step extraction. + """ + + vulnerability_type: str + """Human-readable vulnerability class, e.g. ``"SQL Injection"``, + ``"Command Injection"``, ``"Path Traversal"``. + + Derived from the matching ``TaintSinkConfig.vulnerability_type``. + """ + + severity: Literal["critical", "high", "medium", "low"] = "medium" + """Risk level of this flow, inherited from ``TaintSinkConfig.severity``.""" + + confidence: Literal["high", "medium", "low"] = "medium" + """Confidence in the reported flow. Currently always ``"medium"`` + (CodeQL's dataflow analysis is sound but the sink patterns may + over-approximate).""" + + description: Optional[str] = None + """Human-readable summary of the flow, e.g. + ``"Tainted data from user_input flows to SQL Injection"``.""" + + +@builder +@msgpk +class PyTaintAnalysisResult(BaseModel): + """Container for all taint analysis results for a project. + + Source and sink information is embedded in each ``PyTaintFlow`` via + ``flow.source`` and ``flow.sink`` (both of which carry a ``PyCallsite``), + so there is no need for separate top-level source/sink lists. + """ + + project_path: str + """Absolute path to the root of the analysed project.""" + + flows: List[PyTaintFlow] = [] + """All confirmed taint flows detected in the project. + + Each flow represents a path from an untrusted source to a dangerous sink + that was not blocked by a sanitizer. An empty list means no + vulnerabilities were detected with the current configuration. + """ + + analysis_timestamp: Optional[str] = None + """ISO-8601 UTC timestamp of when the analysis completed, e.g. 
+ ``"2025-05-15T14:00:00+00:00"``.""" + + codeql_database_path: Optional[str] = None + """Absolute path to the CodeQL database used for this analysis run. + Useful for reproducing or extending the analysis.""" + + +# ============================================================================ +# Application Model (combines all analysis levels) +# ============================================================================ + @builder @msgpk class PyCallEdge(BaseModel): @@ -361,7 +729,14 @@ class PyCallEdge(BaseModel): @builder @msgpk class PyApplication(BaseModel): - """Represents a Python application.""" + """Represents a Python application with multi-level analysis results. + + Analysis Levels: + - Level 1: symbol_table (syntactic analysis) + - Level 2: call_graph (control flow analysis) - TODO: implement storage + - Level 3: taint_analysis (data flow security analysis) + """ symbol_table: Dict[str, PyModule] call_graph: List[PyCallEdge] = [] + taint_analysis: Optional[PyTaintAnalysisResult] = None diff --git a/codeanalyzer/semantic_analysis/codeql/codeql_analysis.py b/codeanalyzer/semantic_analysis/codeql/codeql_analysis.py index 0c0e046..8d71e62 100644 --- a/codeanalyzer/semantic_analysis/codeql/codeql_analysis.py +++ b/codeanalyzer/semantic_analysis/codeql/codeql_analysis.py @@ -21,14 +21,23 @@ """ from collections import Counter +from datetime import datetime, timezone from pathlib import Path -from typing import Any, Dict, Iterator, List, Tuple, Union +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union from pandas import DataFrame -from codeanalyzer.schema.py_schema import PyCallEdge, PyModule +from codeanalyzer.schema.py_schema import PyCallEdge, PyCallsite, PyModule from codeanalyzer.semantic_analysis.call_graph import iter_callables_in_symbol_table from codeanalyzer.semantic_analysis.codeql.codeql_query_runner import CodeQLQueryRunner +from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator +from 
codeanalyzer.schema.py_schema import ( + TaintAnalysisConfig, + PyTaintAnalysisResult, + PyTaintSource, + PyTaintSink, + PyTaintFlow, +) from codeanalyzer.utils import logger @@ -49,11 +58,13 @@ def __init__( db_path: Path, codeql_bin: Union[str, Path, None] = None, codeql_packs_dir: Union[str, Path, None] = None, + taint_config: Optional[TaintAnalysisConfig] = None, ) -> None: - self.project_dir = project_dir + self.project_dir = Path(project_dir) self.db_path = db_path self.codeql_bin = codeql_bin self.codeql_packs_dir = codeql_packs_dir + self.taint_config = taint_config self._cached_df: "DataFrame | None" = None def _query_call_edges(self) -> DataFrame: @@ -181,6 +192,33 @@ def _build_callable_location_index( index[(abs_path, c.start_line)] = c return index + @staticmethod + def _build_callsite_location_index( + symbol_table: Dict[str, PyModule], + ) -> Dict[Tuple[str, int], PyCallsite]: + """Build ``(absolute_file_path, start_line) -> PyCallsite`` from the symbol table. + + Iterates every ``PyCallsite`` in every ``PyCallable.call_sites`` list so + that taint sources and sinks can be resolved to the rich call-site objects + already captured during syntactic analysis (receiver type, argument types, + callee signature, …). + + Paths are resolved to absolute form to match CodeQL's ``getAbsolutePath()``. + When two call sites share the same (file, start_line) the first one wins + (ambiguity is rare and an approximation is acceptable here). 
+ """ + index: Dict[Tuple[str, int], PyCallsite] = {} + for callable_ in iter_callables_in_symbol_table(symbol_table): + try: + abs_path = str(Path(callable_.path).resolve()) + except (OSError, RuntimeError): + abs_path = callable_.path + for cs in callable_.call_sites: + key = (abs_path, cs.start_line) + if key not in index: + index[key] = cs + return index + def _iter_resolved_rows( self, symbol_table: Dict[str, PyModule] ) -> "Iterator[Tuple[str, str, Any]]": @@ -298,3 +336,137 @@ def augment_call_sites(self, symbol_table: Dict[str, PyModule]) -> int: f"CodeQL: augmented {augmented} PyCallsite.callee_signature entries." ) return augmented + + def analyze_taint_flows( + self, + config_override: Optional[TaintAnalysisConfig] = None, + symbol_table: Optional[Dict[str, PyModule]] = None, + ) -> PyTaintAnalysisResult: + """Perform taint analysis with configurable sources/sinks/sanitizers. + + Args: + config_override: Optional configuration to override instance config. + symbol_table: Optional symbol table produced by analysis level 1. + When provided, taint sources and sinks are resolved to the + matching ``PyCallsite`` objects already captured during syntactic + analysis (giving access to receiver type, argument types, callee + signature, …). If a match cannot be found a new ``PyCallsite`` + is constructed from the CodeQL location data as a fallback. + + Returns: + PyTaintAnalysisResult: Complete taint analysis results + + Raises: + ValueError: If no taint configuration is available + """ + config = config_override or self.taint_config + + if not config: + raise ValueError("No taint configuration provided. 
Pass config to __init__ or analyze_taint_flows()") + + logger.info("Starting taint analysis...") + logger.debug(f"Configuration: {len(config.sources)} sources, " + f"{len(config.sinks)} sinks, {len(config.sanitizers)} sanitizers") + + # Build callsite index from symbol table for best-effort linkage + callsite_index: Dict[Tuple[str, int], PyCallsite] = ( + self._build_callsite_location_index(symbol_table) + if symbol_table is not None + else {} + ) + if callsite_index: + logger.debug(f"Built callsite index with {len(callsite_index)} entries from symbol table") + + query_string = TaintQueryGenerator.generate_query(config) + column_names = TaintQueryGenerator.get_column_names() + + logger.debug("Executing CodeQL taint analysis query...") + with CodeQLQueryRunner( + self.db_path, + codeql_bin=self.codeql_bin, + codeql_packs_dir=self.codeql_packs_dir, + ) as runner: + result_df = runner.execute(query_string, column_names) + + logger.info(f"Query returned {len(result_df)} taint flows") + + flows = [] + sources_dict: Dict[str, PyTaintSource] = {} + sinks_dict: Dict[str, PyTaintSink] = {} + n_callsite_hits = 0 + + for _, row in result_df.iterrows(): + source_key = f"{row['source_file']}:{row['source_start_line']}" + if source_key not in sources_dict: + # Try to resolve from symbol table; fall back to constructing new + src_cs_key = (row["source_file"], int(row["source_start_line"])) + source_call_site = callsite_index.get(src_cs_key) or PyCallsite( + method_name=row["source_expr"] or row["source_function"], + receiver_expr=None, + start_line=int(row["source_start_line"]), + end_line=int(row["source_end_line"]), + start_column=int(row["source_start_col"]), + end_column=int(row["source_end_col"]), + ) + if src_cs_key in callsite_index: + n_callsite_hits += 1 + source = PyTaintSource( + source_type=row["source_type"], + call_site=source_call_site, + description=f"Untrusted data from {row['source_type']} " + f"in {row['source_qualified_function']} " + 
f"({row['source_file']}:{row['source_start_line']})", + ) + sources_dict[source_key] = source + + sink_key = f"{row['sink_file']}:{row['sink_start_line']}" + if sink_key not in sinks_dict: + # Try to resolve from symbol table; fall back to constructing new + snk_cs_key = (row["sink_file"], int(row["sink_start_line"])) + sink_call_site = callsite_index.get(snk_cs_key) or PyCallsite( + method_name=row["sink_expr"] or row["sink_function"], + receiver_expr=None, + start_line=int(row["sink_start_line"]), + end_line=int(row["sink_end_line"]), + start_column=int(row["sink_start_col"]), + end_column=int(row["sink_end_col"]), + ) + if snk_cs_key in callsite_index: + n_callsite_hits += 1 + sink = PyTaintSink( + sink_type=row["sink_type"], + call_site=sink_call_site, + severity=row["severity"], + description=f"Potential {row['vulnerability_type']} vulnerability " + f"in {row['sink_qualified_function']} " + f"({row['sink_file']}:{row['sink_start_line']})", + ) + sinks_dict[sink_key] = sink + + flow = PyTaintFlow( + flow_id=row["flow_id"], + source=sources_dict[source_key], + sink=sinks_dict[sink_key], + path=[], + vulnerability_type=row["vulnerability_type"], + severity=row["severity"], + confidence="medium", + description=row["message"], + ) + flows.append(flow) + + n_critical = sum(1 for f in flows if f.severity == "critical") + n_high = sum(1 for f in flows if f.severity == "high") + logger.info(f"Taint analysis complete: {len(flows)} flows, " + f"{n_critical} critical, {n_high} high") + if callsite_index: + logger.debug(f"Symbol-table callsite linkage: {n_callsite_hits} of " + f"{len(sources_dict) + len(sinks_dict)} source/sink nodes " + f"resolved to existing PyCallsite objects") + + return PyTaintAnalysisResult( + project_path=str(self.project_dir), + flows=flows, + analysis_timestamp=datetime.now(timezone.utc).isoformat(), + codeql_database_path=str(self.db_path), + ) diff --git a/codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py 
b/codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py index 17eb368..e23035e 100644 --- a/codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py +++ b/codeanalyzer/semantic_analysis/codeql/codeql_query_runner.py @@ -63,6 +63,7 @@ def __init__(self, database_path: str, codeql_bin=None, codeql_packs_dir=None): Path(codeql_packs_dir) if codeql_packs_dir is not None else None ) self.temp_file_path: Path = None + self._temp_qlpack_dir: "tempfile.TemporaryDirectory | None" = None def __enter__(self): """Context entry that prepares paths to execute a CodeQL query. diff --git a/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py new file mode 100644 index 0000000..0b985cb --- /dev/null +++ b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py @@ -0,0 +1,428 @@ +################################################################################ +# Copyright IBM Corporation 2025 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +"""Dynamic CodeQL query generator for taint analysis. + +This module generates CodeQL queries from taint analysis configurations. 
+ +Design philosophy +----------------- +CodeQL's ``codeql/python-all`` pack ships comprehensive built-in taint models +via ``semmle.python.security.dataflow.*`` — these cover hundreds of SQL, +command, path-traversal, XSS, and other sinks automatically, without any +manual API enumeration. + +The generated query therefore uses **two complementary layers**: + +1. **Built-in CodeQL security models** (primary, comprehensive): + - ``RemoteFlowSource`` — all web-framework request sources (Flask, Django, + FastAPI, aiohttp, …) recognised by CodeQL out of the box. + - ``SqlInjection::Sink`` — all DB cursor patterns (sqlite3, psycopg2, + mysql-connector, SQLAlchemy, …). + - ``CommandInjection::Sink`` — subprocess, os.system, shlex, … + - ``CodeInjection::Sink`` — eval, exec, compile, … + - ``PathInjection::Sink`` — open(), os.path operations, … + - ``ReflectedXss::Sink`` — Flask/Django template rendering, … + +2. **Configurable user-defined patterns** (supplementary): + Additional sources/sinks/sanitizers supplied via ``TaintAnalysisConfig`` + that extend the built-in coverage with project-specific APIs. + +Uses the modern CodeQL Python API (codeql/python-all >= 7.x): +- ``DataFlow::ConfigSig`` interface with ``implements`` +- ``TaintTracking::Global`` module +- ``API::Node.asSource()`` / ``API::Node.getParameter(N).asSink()`` +""" + +from typing import List +from codeanalyzer.schema.py_schema import ( + TaintAnalysisConfig, + TaintSourceConfig, + TaintSinkConfig, + TaintSanitizerConfig, +) + + +class TaintQueryGenerator: + """Generates CodeQL queries from taint analysis configuration.""" + + @staticmethod + def generate_query(config: TaintAnalysisConfig) -> str: + """Generate complete taint analysis CodeQL query from configuration. + + The query combines CodeQL's built-in security models with any + user-configured patterns, giving comprehensive coverage without + requiring exhaustive manual API enumeration.
+ + Args: + config: Taint analysis configuration + + Returns: + str: Complete CodeQL query ready for execution + """ + query_parts = [] + + query_parts.append(TaintQueryGenerator._generate_header()) + query_parts.append(TaintQueryGenerator._generate_imports()) + query_parts.append(TaintQueryGenerator._generate_source_predicate(config.sources)) + query_parts.append(TaintQueryGenerator._generate_sink_predicate(config.sinks)) + + if config.sanitizers: + query_parts.append(TaintQueryGenerator._generate_sanitizer_predicate(config.sanitizers)) + + query_parts.append(TaintQueryGenerator._generate_config_sig( + has_sanitizers=len(config.sanitizers) > 0 + )) + query_parts.append(TaintQueryGenerator._generate_flow_module()) + query_parts.append(TaintQueryGenerator._generate_helpers()) + query_parts.append(TaintQueryGenerator._generate_main_query()) + + return "\n\n".join(query_parts) + + # ------------------------------------------------------------------ + # Header / imports + # ------------------------------------------------------------------ + + @staticmethod + def _generate_header() -> str: + """Generate query header with metadata.""" + return """/** + * @name Configurable Taint Analysis + * @description Taint analysis combining CodeQL built-in security models with + * configurable user-defined sources, sinks, and sanitizers. + * @kind path-problem + * @id python/configurable-taint-analysis + * @problem.severity warning + */""" + + @staticmethod + def _generate_imports() -> str: + """Generate import statements. + + Imports both the core DataFlow/TaintTracking modules and the built-in + security-sink/source classes from codeql/python-all so that the query + benefits from CodeQL's comprehensive model library. 
+ + Module names verified against codeql/python-all 7.x: + - SqlInjectionCustomizations → module SqlInjection { class Sink } + - CommandInjectionCustomizations → module CommandInjection { class Sink } + - CodeInjectionCustomizations → module CodeInjection { class Sink } + - PathInjectionCustomizations → module PathInjection { class Sink } + - ReflectedXSSCustomizations → module ReflectedXss { class Sink } + - RemoteFlowSources → class RemoteFlowSource + """ + return """import python +import semmle.python.dataflow.new.DataFlow +import semmle.python.dataflow.new.TaintTracking +import semmle.python.ApiGraphs +import semmle.python.security.dataflow.SqlInjectionCustomizations +import semmle.python.security.dataflow.CommandInjectionCustomizations +import semmle.python.security.dataflow.CodeInjectionCustomizations +import semmle.python.security.dataflow.PathInjectionCustomizations +import semmle.python.security.dataflow.ReflectedXSSCustomizations +import semmle.python.dataflow.new.RemoteFlowSources""" + + # ------------------------------------------------------------------ + # Pattern helpers + # ------------------------------------------------------------------ + + @staticmethod + def _pattern_to_source_node(pattern: str) -> str: + """Convert a pattern string to a DataFlow::Node expression for sources.""" + if pattern.endswith(".getACall()"): + return pattern + return f"{pattern}.asSource()" + + @staticmethod + def _pattern_to_sink_node(pattern: str, argument_index: int) -> str: + """Convert a pattern string to a DataFlow::Node expression for sinks.""" + if pattern.endswith(".getACall()"): + api_node = pattern[:-len(".getACall()")] + return f"{api_node}.getParameter({argument_index}).asSink()" + return f"{pattern}.getParameter({argument_index}).asSink()" + + @staticmethod + def _pattern_to_sanitizer_node(pattern: str) -> str: + """Convert a pattern string to a DataFlow::Node expression for sanitizers.""" + if pattern.endswith(".getACall()"): + return pattern + return 
f"{pattern}.asSource()" + + # ------------------------------------------------------------------ + # Predicate generators + # ------------------------------------------------------------------ + + @staticmethod + def _generate_source_predicate(sources: List[TaintSourceConfig]) -> str: + """Generate isConfiguredSource predicate combining built-in RemoteFlowSource with + any user-configured sources. + + Built-in ``RemoteFlowSource`` covers all web-framework request inputs + (Flask ``request.args/form/json``, Django ``request.GET/POST``, + FastAPI, aiohttp, Tornado, …) recognised by CodeQL's model library. + User-configured patterns extend this with project-specific sources + (e.g. ``sys.argv``, ``input()``, custom HTTP clients). + """ + lines = [ + "predicate isConfiguredSource(DataFlow::Node node, string sourceType) {", + " // Built-in: all web-framework request sources recognised by CodeQL", + " (node instanceof RemoteFlowSource and sourceType = \"web_request\")", + ] + + for source in sources: + lines.append(" or") + lines.append(f" // User-configured: {source.description}") + node_expr = TaintQueryGenerator._pattern_to_source_node(source.pattern) + lines.append(f" (node = {node_expr} and sourceType = \"{source.source_type}\")") + + lines.append("}") + return "\n".join(lines) + + @staticmethod + def _generate_sink_predicate(sinks: List[TaintSinkConfig]) -> str: + """Generate isConfiguredSink predicate combining built-in security sinks with + any user-configured sinks. + + Built-in sink classes from ``codeql/python-all`` cover: + - ``SqlInjection::Sink`` — sqlite3, psycopg2, mysql-connector, + SQLAlchemy, Django ORM raw queries, … + - ``CommandInjection::Sink`` — subprocess.*, os.system, os.popen, … + - ``CodeInjection::Sink`` — eval(), exec(), compile(), … + - ``PathInjection::Sink`` — open(), os.path.*, pathlib.Path.open(), … + - ``ReflectedXss::Sink`` — Flask/Django template rendering, … + + User-configured patterns extend this with project-specific sinks.
+ """ + lines = [ + "predicate isConfiguredSink(DataFlow::Node node, string sinkType, string severity, string vulnerabilityType) {", + " // Built-in: SQL injection sinks (sqlite3, psycopg2, SQLAlchemy, Django ORM raw, …)", + " (node instanceof SqlInjection::Sink and", + " sinkType = \"sql_execution\" and severity = \"critical\" and vulnerabilityType = \"SQL Injection\")", + " or", + " // Built-in: Command injection sinks (subprocess.*, os.system, os.popen, …)", + " (node instanceof CommandInjection::Sink and", + " sinkType = \"command_execution\" and severity = \"critical\" and vulnerabilityType = \"Command Injection\")", + " or", + " // Built-in: Code injection sinks (eval, exec, compile, …)", + " (node instanceof CodeInjection::Sink and", + " sinkType = \"code_execution\" and severity = \"critical\" and vulnerabilityType = \"Code Injection\")", + " or", + " // Built-in: Path injection sinks (open, os.path.*, pathlib.Path.open, …)", + " (node instanceof PathInjection::Sink and", + " sinkType = \"file_access\" and severity = \"high\" and vulnerabilityType = \"Path Traversal\")", + " or", + " // Built-in: Reflected XSS sinks (Flask/Django template rendering, …)", + " (node instanceof ReflectedXss::Sink and", + " sinkType = \"template_rendering\" and severity = \"high\" and vulnerabilityType = \"Cross-Site Scripting (XSS)\")", + ] + + for sink in sinks: + lines.append(" or") + lines.append(f" // User-configured: {sink.description}") + + if sink.argument_index is not None: + node_expr = TaintQueryGenerator._pattern_to_sink_node(sink.pattern, sink.argument_index) + else: + node_expr = TaintQueryGenerator._pattern_to_source_node(sink.pattern) + + lines.append(" (") + lines.append(f" node = {node_expr} and") + lines.append(f" sinkType = \"{sink.sink_type}\" and") + lines.append(f" severity = \"{sink.severity}\" and") + lines.append(f" vulnerabilityType = \"{sink.vulnerability_type}\"") + lines.append(" )") + + lines.append("}") + return "\n".join(lines) + + @staticmethod 
+ def _generate_sanitizer_predicate(sanitizers: List[TaintSanitizerConfig]) -> str: + """Generate isConfiguredSanitizer predicate from configuration.""" + lines = [ + "predicate isConfiguredSanitizer(DataFlow::Node node) {", + ] + + for i, sanitizer in enumerate(sanitizers): + if i > 0: + lines.append(" or") + lines.append(f" // {sanitizer.description}") + node_expr = TaintQueryGenerator._pattern_to_sanitizer_node(sanitizer.pattern) + lines.append(f" node = {node_expr}") + + lines.append("}") + return "\n".join(lines) + + @staticmethod + def _generate_config_sig(has_sanitizers: bool) -> str: + """Generate DataFlow::ConfigSig module using modern CodeQL API.""" + lines = [ + "private module ConfiguredTaintConfig implements DataFlow::ConfigSig {", + " predicate isSource(DataFlow::Node source) {", + " isConfiguredSource(source, _)", + " }", + "", + " predicate isSink(DataFlow::Node sink) {", + " isConfiguredSink(sink, _, _, _)", + " }", + ] + + if has_sanitizers: + lines.extend([ + "", + " predicate isBarrier(DataFlow::Node node) {", + " isConfiguredSanitizer(node)", + " }", + ]) + + lines.extend([ + "", + " predicate observeDiffInformedIncrementalMode() { any() }", + "}", + ]) + + return "\n".join(lines) + + @staticmethod + def _generate_flow_module() -> str: + """Generate TaintTracking::Global module instantiation.""" + return "module ConfiguredTaintFlow = TaintTracking::Global<ConfiguredTaintConfig>;" + + @staticmethod + def _generate_helpers() -> str: + """Generate helper functions for extracting metadata.""" + return """string getFunctionName(DataFlow::Node node) { + result = node.getScope().(Function).getName() + or + not exists(node.getScope().(Function)) and result = "" +} + +string getQualifiedFunctionName(DataFlow::Node node) { + exists(Function f | + f = node.getScope() | + if exists(f.getScope().(Class)) then + result = f.getScope().(Class).getName() + "."
+ f.getName() + else + result = f.getName() + ) + or + not exists(node.getScope().(Function)) and result = "" +}""" + + @staticmethod + def _generate_main_query() -> str: + """Generate main query select statement using modern path-problem API.""" + return """import ConfiguredTaintFlow::PathGraph + +from + ConfiguredTaintFlow::PathNode source, + ConfiguredTaintFlow::PathNode sink, + string sourceType, + string sinkType, + string severity, + string vulnerabilityType +where + ConfiguredTaintFlow::flowPath(source, sink) and + isConfiguredSource(source.getNode(), sourceType) and + isConfiguredSink(sink.getNode(), sinkType, severity, vulnerabilityType) +select + // 1. Element (sink - required for path-problem) + sink.getNode(), + // 2. Source path node (required for path-problem) + source, + // 3. Sink path node (required for path-problem) + sink, + // 4. Message (required for path-problem) + "Tainted data from " + sourceType + " flows to " + vulnerabilityType, + + // Additional metadata columns + // Flow ID + source.getNode().getLocation().getFile().getAbsolutePath() + ":" + + source.getNode().getLocation().getStartLine().toString() + "->" + + sink.getNode().getLocation().getFile().getAbsolutePath() + ":" + + sink.getNode().getLocation().getStartLine().toString(), + + // Source information + source.getNode().getLocation().getFile().getAbsolutePath(), + source.getNode().getLocation().getStartLine(), + source.getNode().getLocation().getEndLine(), + source.getNode().getLocation().getStartColumn(), + source.getNode().getLocation().getEndColumn(), + sourceType, + source.getNode().toString(), + getFunctionName(source.getNode()), + getQualifiedFunctionName(source.getNode()), + + // Sink information + sink.getNode().getLocation().getFile().getAbsolutePath(), + sink.getNode().getLocation().getStartLine(), + sink.getNode().getLocation().getEndLine(), + sink.getNode().getLocation().getStartColumn(), + sink.getNode().getLocation().getEndColumn(), + sinkType, + severity, + 
sink.getNode().toString(), + getFunctionName(sink.getNode()), + getQualifiedFunctionName(sink.getNode()), + vulnerabilityType, + // Confidence (always medium for configurable analysis) + "medium" """ + + @staticmethod + def get_column_names() -> List[str]: + """Get the column names for the query results. + + Column order matches the select statement: + 1. element (sink node - required for path-problem) + 2. source_path (PathNode - required for path-problem) + 3. sink_path (PathNode - required for path-problem) + 4. message (string - required for path-problem) + 5+ additional metadata columns + + Returns: + List[str]: Column names in the order they appear in the query + """ + return [ + # Required path-problem columns (positions 1-4) + "element", + "source_path", + "sink_path", + "message", + # Additional metadata + "flow_id", + # Source columns + "source_file", + "source_start_line", + "source_end_line", + "source_start_col", + "source_end_col", + "source_type", + "source_expr", + "source_function", + "source_qualified_function", + # Sink columns + "sink_file", + "sink_start_line", + "sink_end_line", + "sink_start_col", + "sink_end_col", + "sink_type", + "severity", + "sink_expr", + "sink_function", + "sink_qualified_function", + "vulnerability_type", + "confidence", + ] diff --git a/test/conftest.py b/test/conftest.py index 9af14d4..35043e9 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,4 +1,6 @@ # conftest.py +import shutil +import subprocess from pathlib import Path import pytest @@ -34,3 +36,156 @@ def whole_applications__xarray() -> Path: def single_functionalities__stuff_nested_in_functions() -> Path: """Returns the path to the 'single_functionalities/stuff_nested_in_functions' directory.""" return Path(__file__).parent.resolve().joinpath("fixtures", "single_functionalities", "stuff_nested_in_functions_test") + + +# ============================================================================ +# Taint Analysis CodeQL Database Fixtures +# 
============================================================================ + +_TAINT_FIXTURES_DIR = Path(__file__).parent / "fixtures" / "taint_analysis" + +_TAINT_FIXTURE_APPS = { + "sql_injection": _TAINT_FIXTURES_DIR / "sql_injection_app", + "command_injection": _TAINT_FIXTURES_DIR / "command_injection_app", + "path_traversal": _TAINT_FIXTURES_DIR / "path_traversal_app", + "xss": _TAINT_FIXTURES_DIR / "xss_app", + "flask": _TAINT_FIXTURES_DIR / "flask_app", + "sanitizer": _TAINT_FIXTURES_DIR / "sanitizer_app", +} + + +def _codeql_available() -> bool: + """Check if CodeQL CLI is available.""" + return shutil.which("codeql") is not None + + +def _create_codeql_database(source_dir: Path, db_path: Path) -> bool: + """Create a CodeQL database for a Python source directory.""" + cmd = [ + "codeql", "database", "create", str(db_path), + f"--source-root={source_dir}", + "--language=python", + "--overwrite", + ] + result = subprocess.run(cmd, capture_output=True, text=True) + return result.returncode == 0 + + +@pytest.fixture(scope="session") +def codeql_databases(tmp_path_factory): + """Session-scoped fixture that creates CodeQL databases for all taint fixture apps. + + Databases are created once per test session and shared across all tests. + If CodeQL is not available, returns None and dependent tests will be skipped. 
+ + Returns: + dict: Map of fixture name -> database path, or None if CodeQL unavailable + """ + if not _codeql_available(): + return None + + db_base = tmp_path_factory.mktemp("codeql_dbs") + databases = {} + + for name, source_dir in _TAINT_FIXTURE_APPS.items(): + db_path = db_base / f"{name}_db" + if _create_codeql_database(source_dir, db_path): + databases[name] = db_path + else: + databases[name] = None + + return databases + + +@pytest.fixture(scope="session") +def sql_injection_db(codeql_databases): + """Session-scoped CodeQL database for SQL injection fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("sql_injection") + if db is None: + pytest.skip("Failed to create SQL injection CodeQL database") + return db + + +@pytest.fixture(scope="session") +def command_injection_db(codeql_databases): + """Session-scoped CodeQL database for command injection fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("command_injection") + if db is None: + pytest.skip("Failed to create command injection CodeQL database") + return db + + +@pytest.fixture(scope="session") +def path_traversal_db(codeql_databases): + """Session-scoped CodeQL database for path traversal fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("path_traversal") + if db is None: + pytest.skip("Failed to create path traversal CodeQL database") + return db + + +@pytest.fixture(scope="session") +def xss_db(codeql_databases): + """Session-scoped CodeQL database for XSS fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("xss") + if db is None: + pytest.skip("Failed to create XSS CodeQL database") + return db + + +@pytest.fixture(scope="session") +def flask_db(codeql_databases): + """Session-scoped CodeQL database for Flask fixture.""" + if codeql_databases is None: + 
pytest.skip("CodeQL not available") + db = codeql_databases.get("flask") + if db is None: + pytest.skip("Failed to create Flask CodeQL database") + return db + + +@pytest.fixture(scope="session") +def sanitizer_db(codeql_databases): + """Session-scoped CodeQL database for sanitizer fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("sanitizer") + if db is None: + pytest.skip("Failed to create sanitizer CodeQL database") + return db + + +@pytest.fixture(scope="session") +def codeql_packs_dir(tmp_path_factory): + """Session-scoped fixture that installs a qlpack with codeql/python-all once. + + Returns the pack directory path, or None if CodeQL is unavailable. + Tests that need this should skip when it returns None. + """ + if not _codeql_available(): + return None + + pack_dir = tmp_path_factory.mktemp("codeql_qlpack") + qlpack_yml = pack_dir / "qlpack.yml" + qlpack_yml.write_text( + "name: codeanalyzer-test-pack\n" + "version: 1.0.0\n" + "dependencies:\n" + ' "codeql/python-all": "*"\n' + ) + result = subprocess.run( + ["codeql", "pack", "install", str(pack_dir)], + capture_output=True, + text=True, + ) + if result.returncode != 0: + return None + return pack_dir diff --git a/test/fixtures/taint_analysis/command_injection_app/vulnerable.py b/test/fixtures/taint_analysis/command_injection_app/vulnerable.py new file mode 100644 index 0000000..1dd5972 --- /dev/null +++ b/test/fixtures/taint_analysis/command_injection_app/vulnerable.py @@ -0,0 +1,172 @@ +""" +Command Injection vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. 
+""" + +import os +import subprocess +import sys + + +def vulnerable_os_system(filename): + """Command injection via os.system.""" + # VULNERABLE: User input directly in shell command + os.system("cat " + filename) + + +def vulnerable_subprocess_shell(user_input): + """Command injection via subprocess with shell=True.""" + # VULNERABLE: shell=True with user input + subprocess.call("ls -la " + user_input, shell=True) + + +def vulnerable_popen(command): + """Command injection via os.popen.""" + # VULNERABLE: User input in os.popen + result = os.popen("echo " + command).read() + return result + + +def vulnerable_from_argv(): + """Command injection from command-line arguments.""" + if len(sys.argv) > 1: + directory = sys.argv[1] + # VULNERABLE: Command-line arg in shell command + os.system(f"ls -la {directory}") + + +def vulnerable_from_input(): + """Command injection from user input.""" + filename = input("Enter filename to display: ") + # VULNERABLE: User input in shell command + subprocess.run(f"cat {filename}", shell=True) + + +def vulnerable_eval(user_code): + """Code injection via eval.""" + # VULNERABLE: eval with user input + result = eval(user_code) + return result + + +def vulnerable_exec(user_code): + """Code injection via exec.""" + # VULNERABLE: exec with user input + exec(user_code) + + +def safe_subprocess_no_shell(filename): + """Safe subprocess call without shell.""" + # SAFE: No shell, arguments as list + subprocess.run(["cat", filename]) + + +def safe_subprocess_with_sanitization(filename): + """Safe subprocess with input validation.""" + # SAFE: Input validation + import shlex + safe_filename = shlex.quote(filename) + subprocess.run(f"cat {safe_filename}", shell=True) + + +# Inter-procedural taint flow examples +def get_command_from_user(): + """Source: Get command from user.""" + return input("Enter command: ") + + +def build_shell_command(cmd): + """Intermediate: Build shell command.""" + return "ls -la " + cmd + + +def 
execute_shell_command(command): + """Sink: Execute shell command.""" + os.system(command) + + +def vulnerable_interprocedural(): + """Vulnerable code with taint flow across functions.""" + # Source -> Intermediate -> Sink + user_cmd = get_command_from_user() + full_cmd = build_shell_command(user_cmd) + execute_shell_command(full_cmd) + + +class CommandExecutor: + """Class with vulnerable methods demonstrating inter-method taint flow.""" + + def get_directory_from_args(self): + """Source: Get directory from command-line.""" + return sys.argv[1] if len(sys.argv) > 1 else "/tmp" + + def prepare_command(self, directory): + """Intermediate: Prepare command with tainted data.""" + return f"find {directory} -name '*.txt'" + + def run_command(self, command): + """Sink: Execute command.""" + return subprocess.check_output(command, shell=True) + + def vulnerable_find_files(self): + """Vulnerable method with taint flow across class methods.""" + # Source -> Intermediate -> Sink within class + directory = self.get_directory_from_args() + command = self.prepare_command(directory) + return self.run_command(command) + + +def sanitize_input(user_input): + """Intermediate function that doesn't properly sanitize.""" + # This doesn't actually sanitize for command injection + return user_input.replace(";", "").replace("&", "") + + +def vulnerable_with_weak_sanitization(): + """Vulnerable code with weak sanitization.""" + # Source + user_input = input("Enter filename: ") + # Weak sanitization (still tainted) + sanitized = sanitize_input(user_input) + # Sink + os.system("cat " + sanitized) + + +def get_code_from_file(filename): + """Source: Read code from file.""" + with open(filename, 'r') as f: + return f.read() + + +def vulnerable_eval_from_file(): + """Vulnerable eval with code from file.""" + # Source + code = get_code_from_file(sys.argv[1] if len(sys.argv) > 1 else "input.txt") + # Sink + eval(code) + + +def main(): + """Main function demonstrating vulnerabilities.""" + # Direct 
vulnerabilities + vulnerable_os_system(sys.argv[1] if len(sys.argv) > 1 else "/etc/passwd") + vulnerable_subprocess_shell(input("Enter directory: ")) + vulnerable_popen(input("Enter command: ")) + vulnerable_eval(input("Enter expression: ")) + + # Inter-procedural vulnerabilities + vulnerable_interprocedural() + + # Class-based vulnerabilities + executor = CommandExecutor() + executor.vulnerable_find_files() + + # Vulnerability with weak sanitization + vulnerable_with_weak_sanitization() + + # Safe examples + safe_subprocess_no_shell("/etc/passwd") + + +if __name__ == "__main__": + main() diff --git a/test/fixtures/taint_analysis/flask_app/vulnerable.py b/test/fixtures/taint_analysis/flask_app/vulnerable.py new file mode 100644 index 0000000..df3b4ce --- /dev/null +++ b/test/fixtures/taint_analysis/flask_app/vulnerable.py @@ -0,0 +1,231 @@ +""" +Flask web application with taint vulnerabilities. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +try: + from flask import Flask, request, render_template_string + import sqlite3 + import os + + app = Flask(__name__) + + + @app.route('/search') + def vulnerable_search(): + """SQL injection in search endpoint.""" + query = request.args.get('q', '') + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + # VULNERABLE: User input from request.args in SQL query + sql = f"SELECT * FROM products WHERE name LIKE '%{query}%'" + cursor.execute(sql) + results = cursor.fetchall() + return str(results) + + + @app.route('/user/') + def vulnerable_user_profile(username): + """XSS in user profile.""" + # VULNERABLE: User input from URL parameter in HTML + html = f"

Profile: {username}

" + return html + + + @app.route('/execute') + def vulnerable_execute(): + """Command injection in execute endpoint.""" + cmd = request.args.get('cmd', '') + # VULNERABLE: User input from request.args in shell command + result = os.popen(cmd).read() + return result + + + @app.route('/file') + def vulnerable_file_read(): + """Path traversal in file read.""" + filename = request.args.get('name', '') + # VULNERABLE: User input from request.args in file path + with open(f"/var/www/files/{filename}", 'r') as f: + return f.read() + + + @app.route('/template') + def vulnerable_template(): + """Server-Side Template Injection.""" + template = request.args.get('tmpl', '') + # VULNERABLE: User input in template rendering + return render_template_string(template) + + + @app.route('/login', methods=['POST']) + def vulnerable_login(): + """SQL injection in login form.""" + username = request.form.get('username', '') + password = request.form.get('password', '') + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + # VULNERABLE: User input from request.form in SQL query + sql = f"SELECT * FROM users WHERE username='{username}' AND password='{password}'" + cursor.execute(sql) + user = cursor.fetchone() + return "Login successful" if user else "Login failed" + + + # Inter-procedural taint flow examples + def get_search_query(): + """Source: Get search query from request.""" + return request.args.get('q', '') + + + def build_search_sql(query): + """Intermediate: Build SQL query.""" + return f"SELECT * FROM products WHERE name LIKE '%{query}%'" + + + def execute_sql(sql): + """Sink: Execute SQL query.""" + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + cursor.execute(sql) + return cursor.fetchall() + + + @app.route('/search_v2') + def vulnerable_search_interprocedural(): + """SQL injection with inter-procedural taint flow.""" + # Source -> Intermediate -> Sink + query = get_search_query() + sql = build_search_sql(query) + results = execute_sql(sql) + return 
str(results) + + + class UserService: + """Service class with vulnerable methods.""" + + def get_user_id_from_request(self): + """Source: Get user ID from request.""" + return request.args.get('id', '') + + def format_user_query(self, user_id): + """Intermediate: Format user query.""" + return f"SELECT * FROM users WHERE id = {user_id}" + + def fetch_user(self, query): + """Sink: Execute user query.""" + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + cursor.execute(query) + return cursor.fetchone() + + def get_user_info(self): + """Vulnerable method with taint flow across class methods.""" + user_id = self.get_user_id_from_request() + query = self.format_user_query(user_id) + return self.fetch_user(query) + + + user_service = UserService() + + + @app.route('/user_info') + def vulnerable_user_info(): + """SQL injection via service class.""" + user = user_service.get_user_info() + return str(user) + + + @app.route('/safe_search') + def safe_search(): + """Safe search with parameterized query.""" + query = request.args.get('q', '') + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + # SAFE: Parameterized query + sql = "SELECT * FROM products WHERE name LIKE ?" 
+ cursor.execute(sql, (f'%{query}%',)) + results = cursor.fetchall() + return str(results) + + + if __name__ == '__main__': + app.run(debug=True) + +except ImportError: + # Flask not installed, create dummy functions for analysis + import sqlite3 + import os + import sys + + class Request: + """Mock request object.""" + def __init__(self): + self.args = {'q': '', 'id': '', 'cmd': '', 'name': '', 'tmpl': ''} + self.form = {'username': '', 'password': ''} + + def get(self, key, default=''): + return self.args.get(key, default) + + request = Request() + + + def vulnerable_search(): + """SQL injection in search endpoint.""" + query = request.args.get('q', '') + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + sql = f"SELECT * FROM products WHERE name LIKE '%{query}%'" + cursor.execute(sql) + return cursor.fetchall() + + + def get_search_query(): + """Source: Get search query from request.""" + return request.args.get('q', '') + + + def build_search_sql(query): + """Intermediate: Build SQL query.""" + return f"SELECT * FROM products WHERE name LIKE '%{query}%'" + + + def execute_sql(sql): + """Sink: Execute SQL query.""" + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + cursor.execute(sql) + return cursor.fetchall() + + + def vulnerable_search_interprocedural(): + """SQL injection with inter-procedural taint flow.""" + query = get_search_query() + sql = build_search_sql(query) + results = execute_sql(sql) + return results + + + class UserService: + """Service class with vulnerable methods.""" + + def get_user_id_from_request(self): + """Source: Get user ID from request.""" + return request.args.get('id', '') + + def format_user_query(self, user_id): + """Intermediate: Format user query.""" + return f"SELECT * FROM users WHERE id = {user_id}" + + def fetch_user(self, query): + """Sink: Execute user query.""" + conn = sqlite3.connect('app.db') + cursor = conn.cursor() + cursor.execute(query) + return cursor.fetchone() + + def get_user_info(self): + 
"""Vulnerable method with taint flow across class methods.""" + user_id = self.get_user_id_from_request() + query = self.format_user_query(user_id) + return self.fetch_user(query) diff --git a/test/fixtures/taint_analysis/path_traversal_app/vulnerable.py b/test/fixtures/taint_analysis/path_traversal_app/vulnerable.py new file mode 100644 index 0000000..d36735a --- /dev/null +++ b/test/fixtures/taint_analysis/path_traversal_app/vulnerable.py @@ -0,0 +1,189 @@ +""" +Path Traversal vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +import os +import sys + + +def vulnerable_open_direct(filename): + """Path traversal via direct file open.""" + # VULNERABLE: User input directly in file path + with open("/var/www/uploads/" + filename, 'r') as f: + return f.read() + + +def vulnerable_open_fstring(filename): + """Path traversal via f-string.""" + # VULNERABLE: f-string with user input + with open(f"/var/www/uploads/{filename}", 'r') as f: + return f.read() + + +def vulnerable_from_argv(): + """Path traversal from command-line arguments.""" + if len(sys.argv) > 1: + filepath = sys.argv[1] + # VULNERABLE: Command-line arg in file path + with open(filepath, 'r') as f: + print(f.read()) + + +def vulnerable_from_input(): + """Path traversal from user input.""" + filename = input("Enter filename to read: ") + # VULNERABLE: User input in file path + with open("/var/www/data/" + filename, 'r') as f: + return f.read() + + +def vulnerable_os_path_join(user_path): + """Path traversal via os.path.join.""" + # VULNERABLE: os.path.join doesn't prevent traversal + full_path = os.path.join("/var/www/uploads", user_path) + with open(full_path, 'r') as f: + return f.read() + + +def vulnerable_write_file(filename, content): + """Path traversal in file write.""" + # VULNERABLE: User input in write path + with open("/var/www/uploads/" + filename, 'w') as f: + f.write(content) + + +def safe_with_normalization(filename): + """Safe 
file access with path normalization.""" + # SAFE: Path normalization and validation + base_dir = "/var/www/uploads" + full_path = os.path.normpath(os.path.join(base_dir, filename)) + + # Ensure the path is within base_dir + if not full_path.startswith(base_dir): + raise ValueError("Invalid file path") + + with open(full_path, 'r') as f: + return f.read() + + +def safe_with_basename(filename): + """Safe file access using basename.""" + # SAFE: Only use basename, preventing directory traversal + safe_filename = os.path.basename(filename) + with open(f"/var/www/uploads/{safe_filename}", 'r') as f: + return f.read() + + +# Inter-procedural taint flow examples +def get_filename_from_user(): + """Source: Get filename from user.""" + return input("Enter filename: ") + + +def construct_file_path(filename): + """Intermediate: Construct file path.""" + return "/var/www/uploads/" + filename + + +def read_file_content(filepath): + """Sink: Read file content.""" + with open(filepath, 'r') as f: + return f.read() + + +def vulnerable_interprocedural(): + """Vulnerable code with taint flow across functions.""" + # Source -> Intermediate -> Sink + filename = get_filename_from_user() + filepath = construct_file_path(filename) + content = read_file_content(filepath) + return content + + +class FileManager: + """Class with vulnerable methods demonstrating inter-method taint flow.""" + + def __init__(self, base_dir="/var/www/data"): + self.base_dir = base_dir + + def get_filename_from_args(self): + """Source: Get filename from command-line.""" + return sys.argv[1] if len(sys.argv) > 1 else "default.txt" + + def build_path(self, filename): + """Intermediate: Build file path with tainted data.""" + return self.base_dir + "/" + filename + + def read_file(self, filepath): + """Sink: Read file.""" + with open(filepath, 'r') as f: + return f.read() + + def vulnerable_read(self): + """Vulnerable method with taint flow across class methods.""" + # Source -> Intermediate -> Sink within class + 
filename = self.get_filename_from_args() + filepath = self.build_path(filename) + return self.read_file(filepath) + + +def process_filename(filename): + """Intermediate function that processes filename.""" + # Remove leading/trailing whitespace but doesn't prevent traversal + return filename.strip() + + +def vulnerable_with_processing(): + """Vulnerable code with filename processing.""" + # Source + raw_filename = input("Enter filename: ") + # Processing (still tainted) + processed = process_filename(raw_filename) + # Sink + with open("/var/www/uploads/" + processed, 'r') as f: + return f.read() + + +def get_path_from_config(): + """Source: Get path from configuration file.""" + # Simulating reading from a config file + return sys.argv[1] if len(sys.argv) > 1 else "../../../etc/passwd" + + +def vulnerable_from_config(): + """Vulnerable code with path from config.""" + # Source + filepath = get_path_from_config() + # Sink + with open(filepath, 'r') as f: + return f.read() + + +def main(): + """Main function demonstrating vulnerabilities.""" + # Direct vulnerabilities + vulnerable_open_direct(sys.argv[1] if len(sys.argv) > 1 else "../../etc/passwd") + vulnerable_open_fstring(input("Enter filename: ")) + vulnerable_os_path_join(input("Enter path: ")) + + # Inter-procedural vulnerabilities + vulnerable_interprocedural() + + # Class-based vulnerabilities + fm = FileManager() + fm.vulnerable_read() + + # Vulnerability with processing + vulnerable_with_processing() + + # Vulnerability from config + vulnerable_from_config() + + # Safe examples + safe_with_normalization("safe_file.txt") + safe_with_basename("../../../etc/passwd") # Will only use "passwd" + + +if __name__ == "__main__": + main() diff --git a/test/fixtures/taint_analysis/sanitizer_app/mixed.py b/test/fixtures/taint_analysis/sanitizer_app/mixed.py new file mode 100644 index 0000000..4d9e587 --- /dev/null +++ b/test/fixtures/taint_analysis/sanitizer_app/mixed.py @@ -0,0 +1,114 @@ +""" +Test application with 
both vulnerable and safe code. +This demonstrates the difference between sanitized and unsanitized flows. +""" + +import sqlite3 +import sys +from html import escape + + +# Vulnerable: No sanitizer +def vulnerable_no_sanitizer(): + """Vulnerable code without sanitizer.""" + user_input = input("Enter username: ") + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # VULNERABLE: No sanitization + query = f"SELECT * FROM users WHERE username = '{user_input}'" + cursor.execute(query) + return cursor.fetchall() + + +# Safe: With sanitizer +def safe_with_sanitizer(): + """Safe code with sanitizer.""" + user_input = input("Enter username: ") + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # SAFE: Parameterized query (sanitizer) + query = "SELECT * FROM users WHERE username = ?" + cursor.execute(query, (user_input,)) + return cursor.fetchall() + + +# Vulnerable: Weak sanitization +def weak_sanitize(user_input): + """Weak sanitizer that doesn't fully protect.""" + # This only removes single quotes, but doesn't prevent all SQL injection + return user_input.replace("'", "") + + +def vulnerable_weak_sanitizer(): + """Vulnerable code with weak sanitization.""" + user_input = input("Enter user ID: ") + # Weak sanitization + sanitized = weak_sanitize(user_input) + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # STILL VULNERABLE: Weak sanitization doesn't prevent numeric injection + query = f"SELECT * FROM users WHERE id = {sanitized}" + cursor.execute(query) + return cursor.fetchall() + + +# Safe: Strong sanitization +def strong_sanitize_html(content): + """Strong HTML sanitizer.""" + return escape(content) + + +def safe_strong_sanitizer(): + """Safe code with strong sanitization.""" + user_content = input("Enter content: ") + # Strong sanitization + safe_content = strong_sanitize_html(user_content) + # SAFE: Content is properly escaped + html = f"
{safe_content}
" + return html + + +# Vulnerable: Sanitizer bypassed +def bypass_sanitizer(): + """Vulnerable code where sanitizer is bypassed.""" + user_input = input("Enter username: ") + + # Sanitizer exists but is not used + def unused_sanitizer(text): + return escape(text) + + # VULNERABLE: Sanitizer defined but not called + html = f"

Welcome, {user_input}!

" + return html + + +# Safe: Sanitizer properly applied +def proper_sanitizer_usage(): + """Safe code with properly applied sanitizer.""" + user_input = input("Enter username: ") + + # Sanitizer is defined + def html_sanitizer(text): + return escape(text) + + # SAFE: Sanitizer is actually used + safe_input = html_sanitizer(user_input) + html = f"

Welcome, {safe_input}!

" + return html + + +def main(): + """Main function demonstrating vulnerable vs safe code.""" + # Vulnerable examples + vulnerable_no_sanitizer() + vulnerable_weak_sanitizer() + bypass_sanitizer() + + # Safe examples + safe_with_sanitizer() + safe_strong_sanitizer() + proper_sanitizer_usage() + + +if __name__ == "__main__": + main() diff --git a/test/fixtures/taint_analysis/sanitizer_app/safe.py b/test/fixtures/taint_analysis/sanitizer_app/safe.py new file mode 100644 index 0000000..9f47582 --- /dev/null +++ b/test/fixtures/taint_analysis/sanitizer_app/safe.py @@ -0,0 +1,201 @@ +""" +Test application demonstrating sanitizers blocking taint flows. +This file shows how proper sanitization prevents vulnerabilities. +""" + +import sqlite3 +import subprocess +import os +import sys +from html import escape +import shlex + + +# SQL Injection with Sanitizers +def get_user_id_from_input(): + """Source: Get user ID from input.""" + return input("Enter user ID: ") + + +def sanitize_for_sql_parameterized(user_id): + """Sanitizer: Use parameterized query (proper sanitization).""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # SAFE: Parameterized query acts as sanitizer + query = "SELECT * FROM users WHERE id = ?" 
+ cursor.execute(query, (user_id,)) + return cursor.fetchall() + + +def safe_sql_with_sanitizer(): + """Safe SQL query with proper sanitization.""" + # Source -> Sanitizer -> Sink (should NOT be flagged) + user_id = get_user_id_from_input() + result = sanitize_for_sql_parameterized(user_id) + return result + + +# Command Injection with Sanitizers +def get_filename_from_input(): + """Source: Get filename from input.""" + return input("Enter filename: ") + + +def sanitize_for_shell(filename): + """Sanitizer: Quote shell argument.""" + return shlex.quote(filename) + + +def execute_with_sanitized_input(safe_filename): + """Sink: Execute command with sanitized input.""" + # SAFE: Input has been sanitized + subprocess.run(f"cat {safe_filename}", shell=True) + + +def safe_command_with_sanitizer(): + """Safe command execution with proper sanitization.""" + # Source -> Sanitizer -> Sink (should NOT be flagged) + filename = get_filename_from_input() + safe_filename = sanitize_for_shell(filename) + execute_with_sanitized_input(safe_filename) + + +# Path Traversal with Sanitizers +def get_filepath_from_input(): + """Source: Get filepath from input.""" + return input("Enter file path: ") + + +def sanitize_path(filepath): + """Sanitizer: Normalize and validate path.""" + base_dir = "/var/www/uploads" + full_path = os.path.normpath(os.path.join(base_dir, filepath)) + + # Ensure the path is within base_dir + if not full_path.startswith(base_dir): + raise ValueError("Invalid file path") + + return full_path + + +def read_file_safe(safe_path): + """Sink: Read file with sanitized path.""" + # SAFE: Path has been sanitized + with open(safe_path, 'r') as f: + return f.read() + + +def safe_file_read_with_sanitizer(): + """Safe file read with proper sanitization.""" + # Source -> Sanitizer -> Sink (should NOT be flagged) + filepath = get_filepath_from_input() + safe_path = sanitize_path(filepath) + content = read_file_safe(safe_path) + return content + + +# XSS with Sanitizers +def 
get_html_content_from_input(): + """Source: Get HTML content from input.""" + return input("Enter HTML content: ") + + +def sanitize_html(content): + """Sanitizer: Escape HTML entities.""" + return escape(content) + + +def render_html_safe(safe_content): + """Sink: Render HTML with sanitized content.""" + # SAFE: Content has been sanitized + html = f"
{safe_content}
" + print(html) + return html + + +def safe_html_render_with_sanitizer(): + """Safe HTML rendering with proper sanitization.""" + # Source -> Sanitizer -> Sink (should NOT be flagged) + content = get_html_content_from_input() + safe_content = sanitize_html(content) + html = render_html_safe(safe_content) + return html + + +# Basename sanitizer for path traversal +def sanitize_with_basename(filepath): + """Sanitizer: Use only the basename.""" + return os.path.basename(filepath) + + +def safe_file_with_basename(): + """Safe file access using basename sanitizer.""" + # Source -> Sanitizer -> Sink (should NOT be flagged) + filepath = input("Enter filename: ") + safe_filename = sanitize_with_basename(filepath) + with open(f"/var/www/uploads/{safe_filename}", 'r') as f: + return f.read() + + +# Class-based sanitization +class SecureDatabase: + """Database class with proper sanitization.""" + + def __init__(self): + self.conn = sqlite3.connect('test.db') + self.cursor = self.conn.cursor() + + def get_username_from_args(self): + """Source: Get username from command-line.""" + return sys.argv[1] if len(sys.argv) > 1 else "admin" + + def execute_safe_query(self, username): + """Sanitizer + Sink: Execute parameterized query.""" + # SAFE: Parameterized query + query = "SELECT * FROM users WHERE username = ?" 
+ self.cursor.execute(query, (username,)) + return self.cursor.fetchall() + + def safe_lookup(self): + """Safe method with sanitization.""" + # Source -> Sanitizer/Sink (should NOT be flagged) + username = self.get_username_from_args() + return self.execute_safe_query(username) + + +# Multiple sanitizers in sequence +def double_sanitize_path(filepath): + """Apply multiple sanitizers.""" + # First sanitizer: basename + safe_name = os.path.basename(filepath) + # Second sanitizer: normpath + safe_path = os.path.normpath(safe_name) + return safe_path + + +def safe_with_multiple_sanitizers(): + """Safe code with multiple sanitizers.""" + # Source -> Sanitizer1 -> Sanitizer2 -> Sink (should NOT be flagged) + filepath = input("Enter path: ") + safe_path = double_sanitize_path(filepath) + with open(f"/var/www/uploads/{safe_path}", 'r') as f: + return f.read() + + +def main(): + """Main function demonstrating safe code with sanitizers.""" + # All of these should be safe due to sanitizers + safe_sql_with_sanitizer() + safe_command_with_sanitizer() + safe_file_read_with_sanitizer() + safe_html_render_with_sanitizer() + safe_file_with_basename() + safe_with_multiple_sanitizers() + + # Class-based safe code + db = SecureDatabase() + db.safe_lookup() + + +if __name__ == "__main__": + main() diff --git a/test/fixtures/taint_analysis/sql_injection_app/vulnerable.py b/test/fixtures/taint_analysis/sql_injection_app/vulnerable.py new file mode 100644 index 0000000..334cb96 --- /dev/null +++ b/test/fixtures/taint_analysis/sql_injection_app/vulnerable.py @@ -0,0 +1,159 @@ +""" +SQL Injection vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. 
+""" + +import sqlite3 +import sys + + +def vulnerable_query_direct(user_input): + """Direct SQL injection vulnerability - user input directly in query.""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # VULNERABLE: Direct string concatenation + query = "SELECT * FROM users WHERE username = '" + user_input + "'" + cursor.execute(query) + return cursor.fetchall() + + +def vulnerable_query_format(user_input): + """SQL injection via string formatting.""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # VULNERABLE: String formatting + query = "SELECT * FROM users WHERE id = {}".format(user_input) + cursor.execute(query) + return cursor.fetchall() + + +def vulnerable_query_fstring(username): + """SQL injection via f-string.""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # VULNERABLE: f-string interpolation + query = f"SELECT * FROM users WHERE username = '{username}'" + cursor.execute(query) + return cursor.fetchall() + + +def vulnerable_from_argv(): + """SQL injection from command-line arguments.""" + if len(sys.argv) > 1: + user_id = sys.argv[1] + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # VULNERABLE: Command-line arg directly in query + query = "DELETE FROM users WHERE id = " + user_id + cursor.execute(query) + conn.commit() + + +def safe_query_parameterized(user_input): + """Safe query using parameterized statements.""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + # SAFE: Parameterized query + query = "SELECT * FROM users WHERE username = ?" 
+ cursor.execute(query, (user_input,)) + return cursor.fetchall() + + +# Inter-procedural taint flow examples +def get_user_input(): + """Source: Get user input.""" + return input("Enter username: ") + + +def build_query(username): + """Intermediate function that propagates taint.""" + return "SELECT * FROM users WHERE username = '" + username + "'" + + +def execute_query(query): + """Sink: Execute SQL query.""" + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + cursor.execute(query) + return cursor.fetchall() + + +def vulnerable_interprocedural(): + """Vulnerable code with taint flow across functions.""" + # Source -> Intermediate -> Sink + user_input = get_user_input() + query = build_query(user_input) + results = execute_query(query) + return results + + +class UserDatabase: + """Class with vulnerable methods demonstrating inter-method taint flow.""" + + def __init__(self): + self.conn = sqlite3.connect('test.db') + self.cursor = self.conn.cursor() + + def get_username_from_args(self): + """Source: Get username from command-line.""" + return sys.argv[1] if len(sys.argv) > 1 else "admin" + + def format_query(self, username): + """Intermediate: Format query with tainted data.""" + return f"SELECT * FROM users WHERE username = '{username}'" + + def run_query(self, query): + """Sink: Execute query.""" + self.cursor.execute(query) + return self.cursor.fetchall() + + def vulnerable_lookup(self): + """Vulnerable method with taint flow across class methods.""" + # Source -> Intermediate -> Sink within class + username = self.get_username_from_args() + query = self.format_query(username) + return self.run_query(query) + + +def process_user_data(data): + """Intermediate function that returns tainted data.""" + return data.strip().upper() + + +def vulnerable_with_processing(): + """Vulnerable code with data processing in between.""" + # Source + raw_input = input("Enter user ID: ") + # Processing (still tainted) + processed = process_user_data(raw_input) + # 
Sink + conn = sqlite3.connect('test.db') + cursor = conn.cursor() + query = "SELECT * FROM users WHERE id = " + processed + cursor.execute(query) + return cursor.fetchall() + + +def main(): + """Main function demonstrating vulnerabilities.""" + # Direct vulnerabilities + vulnerable_query_direct(sys.argv[1] if len(sys.argv) > 1 else "admin") + vulnerable_query_format(input("Enter user ID: ")) + vulnerable_query_fstring(input("Enter username: ")) + + # Inter-procedural vulnerabilities + vulnerable_interprocedural() + + # Class-based vulnerabilities + db = UserDatabase() + db.vulnerable_lookup() + + # Vulnerability with processing + vulnerable_with_processing() + + # Safe example + safe_query_parameterized(input("Enter safe username: ")) + + +if __name__ == "__main__": + main() diff --git a/test/fixtures/taint_analysis/xss_app/vulnerable.py b/test/fixtures/taint_analysis/xss_app/vulnerable.py new file mode 100644 index 0000000..7e20b10 --- /dev/null +++ b/test/fixtures/taint_analysis/xss_app/vulnerable.py @@ -0,0 +1,217 @@ +""" +Cross-Site Scripting (XSS) vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +import sys +from html import escape + + +def vulnerable_html_output(user_input): + """XSS via direct HTML output.""" + # VULNERABLE: User input directly in HTML + html = "
" + user_input + "
" + return html + + +def vulnerable_html_fstring(username): + """XSS via f-string in HTML.""" + # VULNERABLE: f-string with user input + html = f"

Welcome, {username}!

" + return html + + +def vulnerable_html_format(comment): + """XSS via string format in HTML.""" + # VULNERABLE: String formatting + html = "

Comment: {}

".format(comment) + return html + + +def vulnerable_from_argv(): + """XSS from command-line arguments.""" + if len(sys.argv) > 1: + message = sys.argv[1] + # VULNERABLE: Command-line arg in HTML + html = f"
{message}
" + return html + + +def vulnerable_from_input(): + """XSS from user input.""" + name = input("Enter your name: ") + # VULNERABLE: User input in HTML + html = "Hello, " + name + "" + return html + + +def vulnerable_javascript_injection(callback): + """XSS via JavaScript injection.""" + # VULNERABLE: User input in JavaScript + script = f"" + return script + + +def safe_with_escape(user_input): + """Safe HTML output with escaping.""" + # SAFE: HTML escaping + html = "
" + escape(user_input) + "
" + return html + + +def safe_with_template(user_input): + """Safe HTML output using template with auto-escaping.""" + # SAFE: Template with auto-escaping (simulated) + escaped_input = escape(user_input) + html = f"
{escaped_input}
" + return html + + +# Inter-procedural taint flow examples +def get_user_comment(): + """Source: Get user comment.""" + return input("Enter your comment: ") + + +def format_html_comment(comment): + """Intermediate: Format comment as HTML.""" + return f"
{comment}
" + + +def render_html(html): + """Sink: Render HTML (simulated).""" + print(html) + return html + + +def vulnerable_interprocedural(): + """Vulnerable code with taint flow across functions.""" + # Source -> Intermediate -> Sink + comment = get_user_comment() + html = format_html_comment(comment) + render_html(html) + + +class HTMLRenderer: + """Class with vulnerable methods demonstrating inter-method taint flow.""" + + def get_username_from_args(self): + """Source: Get username from command-line.""" + return sys.argv[1] if len(sys.argv) > 1 else "Guest" + + def create_greeting(self, username): + """Intermediate: Create greeting HTML with tainted data.""" + return f"

Hello, {username}!

" + + def output_html(self, html): + """Sink: Output HTML.""" + print(html) + return html + + def vulnerable_greeting(self): + """Vulnerable method with taint flow across class methods.""" + # Source -> Intermediate -> Sink within class + username = self.get_username_from_args() + greeting = self.create_greeting(username) + return self.output_html(greeting) + + +def capitalize_text(text): + """Intermediate function that processes text.""" + # Capitalization doesn't prevent XSS + return text.upper() + + +def vulnerable_with_processing(): + """Vulnerable code with text processing.""" + # Source + user_text = input("Enter text: ") + # Processing (still tainted) + processed = capitalize_text(user_text) + # Sink + html = f"

{processed}

" + print(html) + return html + + +def get_message_from_file(): + """Source: Get message from file.""" + try: + with open(sys.argv[1] if len(sys.argv) > 1 else "message.txt", 'r') as f: + return f.read() + except: + return "" + + +def vulnerable_from_file(): + """Vulnerable code with message from file.""" + # Source + message = get_message_from_file() + # Sink + html = f"
{message}
" + return html + + +class BlogPost: + """Class demonstrating complex taint flow.""" + + def __init__(self): + self.title = "" + self.content = "" + + def set_title_from_input(self): + """Source: Set title from user input.""" + self.title = input("Enter post title: ") + + def set_content_from_input(self): + """Source: Set content from user input.""" + self.content = input("Enter post content: ") + + def render_title(self): + """Sink: Render title as HTML.""" + return f"

{self.title}

" + + def render_content(self): + """Sink: Render content as HTML.""" + return f"
{self.content}
" + + def render_full_post(self): + """Vulnerable method with multiple taint flows.""" + self.set_title_from_input() + self.set_content_from_input() + title_html = self.render_title() + content_html = self.render_content() + return title_html + content_html + + +def main(): + """Main function demonstrating vulnerabilities.""" + # Direct vulnerabilities + vulnerable_html_output(sys.argv[1] if len(sys.argv) > 1 else "") + vulnerable_html_fstring(input("Enter username: ")) + vulnerable_html_format(input("Enter comment: ")) + vulnerable_javascript_injection(input("Enter callback: ")) + + # Inter-procedural vulnerabilities + vulnerable_interprocedural() + + # Class-based vulnerabilities + renderer = HTMLRenderer() + renderer.vulnerable_greeting() + + # Vulnerability with processing + vulnerable_with_processing() + + # Vulnerability from file + vulnerable_from_file() + + # Complex class-based vulnerability + post = BlogPost() + post.render_full_post() + + # Safe examples + safe_with_escape("") + + +if __name__ == "__main__": + main() diff --git a/test/test_cli.py b/test/test_cli.py index b4ba50d..cdce465 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -1,8 +1,12 @@ import json +import shutil from pathlib import Path +import pytest from codeanalyzer.__main__ import app from codeanalyzer.utils import logger +_TAINT_FIXTURES_DIR = Path(__file__).parent / "fixtures" / "taint_analysis" + def test_cli_help(cli_runner): """Must be able to run the CLI and see help output.""" @@ -72,4 +76,126 @@ def test_single_file(cli_runner, single_functionalities__stuff_nested_in_functio json_obj = json.loads(Path(output_dir).joinpath("analysis.json").read_text()) assert json_obj is not None, "JSON output should not be None" assert isinstance(json_obj, dict), "JSON output should be a dictionary" - assert "symbol_table" in json_obj.keys(), "Symbol table should be present in the output" \ No newline at end of file + assert "symbol_table" in json_obj.keys(), "Symbol table should be 
present in the output" + + +def test_cli_taint_analysis(cli_runner, tmp_path): + """CLI with --analysis-level 3 --codeql must produce analysis.json with taint_analysis. + + Uses sql_injection_app which has 3 vulnerable cursor.execute() calls (direct concat, + format string, f-string) plus sys.argv → execute. CodeQL's SqlInjection::Sink model + detects all of them via the built-in model layer. + """ + if not shutil.which("codeql"): + pytest.skip("CodeQL CLI not available") + + sql_injection_app = _TAINT_FIXTURES_DIR / "sql_injection_app" + output_dir = tmp_path / "output" + output_dir.mkdir(parents=True, exist_ok=True) + cache_dir = tmp_path / "cache" + + result = cli_runner.invoke( + app, + [ + "--input", str(sql_injection_app), + "--output", str(output_dir), + "--analysis-level", "3", + "--codeql", + "--no-ray", + "--cache-dir", str(cache_dir), + "--clear-cache", + "--format=json", + ], + env={"NO_COLOR": "1", "TERM": "dumb"}, + ) + + assert result.exit_code == 0, ( + f"CLI command should succeed. 
Output:\n{result.output}" + ) + + analysis_file = output_dir / "analysis.json" + assert analysis_file.exists(), "analysis.json should be created in the output directory" + + json_obj = json.loads(analysis_file.read_text()) + assert isinstance(json_obj, dict), "JSON output should be a dictionary" + + # --- Symbol table --- + assert "symbol_table" in json_obj, "symbol_table must be present in analysis.json" + assert len(json_obj["symbol_table"]) > 0, "symbol_table should not be empty" + + # --- Taint analysis top-level structure --- + assert "taint_analysis" in json_obj, ( + "taint_analysis key must be present in analysis.json for --analysis-level 3" + ) + taint = json_obj["taint_analysis"] + assert taint is not None, "taint_analysis must not be null" + for key in ("flows", "project_path"): + assert key in taint, f"taint_analysis must contain '{key}'" + assert "statistics" not in taint, "taint_analysis must not contain 'statistics' (field was removed)" + assert "sources" not in taint, "taint_analysis must not contain top-level 'sources' (embedded in flows)" + assert "sinks" not in taint, "taint_analysis must not contain top-level 'sinks' (embedded in flows)" + + # --- Flow count --- + flows = taint["flows"] + assert isinstance(flows, list), "taint_analysis.flows must be a list" + assert len(flows) >= 6, ( + f"Expected at least 6 SQL injection flows from sql_injection_app, got {len(flows)}" + ) + + # --- All flows are SQL Injection --- + sql_flows = [f for f in flows if f.get("vulnerability_type") == "SQL Injection"] + assert len(sql_flows) >= 6, ( + f"Expected at least 6 SQL Injection flows, got {len(sql_flows)}" + ) + + # --- All SQL Injection flows are critical --- + assert all(f["severity"] == "critical" for f in sql_flows), ( + "All SQL Injection flows must be critical severity" + ) + + # --- Each flow has required fields with valid values --- + for flow in flows: + assert flow.get("flow_id"), "Each flow must have a non-empty flow_id" + assert 
flow.get("vulnerability_type"), "Each flow must have a vulnerability_type"
+        assert flow["severity"] in ("critical", "high", "medium", "low"), (
+            f"severity must be critical/high/medium/low, got {flow['severity']!r}"
+        )
+        assert flow.get("confidence") in ("high", "medium", "low"), (
+            f"confidence must be high/medium/low, got {flow.get('confidence')!r}"
+        )
+
+        # Source fields — location/line info is now inside call_site
+        source = flow.get("source", {})
+        assert source.get("source_type"), "Flow source must have a non-empty source_type"
+        source_cs = source.get("call_site", {})
+        assert source_cs, "Flow source must have a call_site"
+        assert isinstance(source_cs.get("start_line"), int) and source_cs["start_line"] > 0, (
+            "Flow source.call_site.start_line must be a positive integer"
+        )
+
+        # Sink fields — location/line info is now inside call_site
+        sink = flow.get("sink", {})
+        assert sink.get("sink_type"), "Flow sink must have a non-empty sink_type"
+        sink_cs = sink.get("call_site", {})
+        assert sink_cs, "Flow sink must have a call_site"
+        assert isinstance(sink_cs.get("start_line"), int) and sink_cs["start_line"] > 0, (
+            "Flow sink.call_site.start_line must be a positive integer"
+        )
+        # All SQL injection sinks should be sql_execution type
+        assert sink["sink_type"] == "sql_execution", (
+            f"Expected sql_execution sink type, got {sink['sink_type']!r}"
+        )
+
+    # --- Severity consistency (derived from flows, no statistics field) ---
+    n_critical = sum(1 for f in flows if f.get("severity") == "critical")
+    assert n_critical >= 6, (
+        f"Expected at least 6 critical flows, got {n_critical}"
+    )
+    # No flow may carry a severity outside the known levels (missing -> "unknown")
+    severity_counts = {}
+    for f in flows:
+        sev = f.get("severity", "unknown")
+        severity_counts[sev] = severity_counts.get(sev, 0) + 1
+    assert set(severity_counts) <= {"critical", "high", "medium", "low"}, (
+        f"Unexpected severity values in flows: {sorted(severity_counts)}"
+    )
diff --git a/test/test_taint_analysis.py b/test/test_taint_analysis.py
new file mode 100644 index 0000000..9cb15e4 --- /dev/null +++ b/test/test_taint_analysis.py @@ -0,0 +1,841 @@ +""" +Unit tests for taint analysis functionality. +Tests the taint analysis feature at analysis level 3. + +Tests are organized into two groups: +1. Infrastructure tests (no CodeQL required) - always run +2. Integration tests (require CodeQL) - skipped if CodeQL unavailable +""" + +import pytest +from pathlib import Path +from codeanalyzer.core import Codeanalyzer +from codeanalyzer.options.options import AnalysisOptions +from codeanalyzer.schema.py_schema import PyTaintAnalysisResult +from codeanalyzer.config.taint_config_defaults import get_default_taint_config +from codeanalyzer.config.taint_config_loader import TaintConfigLoader +from codeanalyzer.semantic_analysis.codeql.codeql_analysis import CodeQL + + +# Test fixtures directory +FIXTURES_DIR = Path(__file__).parent / "fixtures" / "taint_analysis" + + +@pytest.fixture +def sql_injection_app(): + """Path to SQL injection test app.""" + return FIXTURES_DIR / "sql_injection_app" + + +@pytest.fixture +def command_injection_app(): + """Path to command injection test app.""" + return FIXTURES_DIR / "command_injection_app" + + +@pytest.fixture +def path_traversal_app(): + """Path to path traversal test app.""" + return FIXTURES_DIR / "path_traversal_app" + + +@pytest.fixture +def xss_app(): + """Path to XSS test app.""" + return FIXTURES_DIR / "xss_app" + + +@pytest.fixture +def flask_app(): + """Path to Flask test app.""" + return FIXTURES_DIR / "flask_app" + + +@pytest.fixture +def sanitizer_app(): + """Path to sanitizer test app.""" + return FIXTURES_DIR / "sanitizer_app" + + +@pytest.fixture +def default_taint_config(): + """Get default taint configuration.""" + return get_default_taint_config() + + +# ============================================================================ +# Infrastructure Tests (no CodeQL required) +# ============================================================================ + 
+class TestTaintAnalysisConfiguration: + """Tests for taint analysis configuration.""" + + def test_default_configuration(self, default_taint_config): + """Test default taint configuration.""" + assert len(default_taint_config.sources) > 0 + assert len(default_taint_config.sinks) > 0 + assert len(default_taint_config.sanitizers) > 0 + + # Verify all sources are enabled by default + enabled_sources = [s for s in default_taint_config.sources if s.enabled] + assert len(enabled_sources) == len(default_taint_config.sources) + + # Verify all sinks are enabled by default + enabled_sinks = [s for s in default_taint_config.sinks if s.enabled] + assert len(enabled_sinks) == len(default_taint_config.sinks) + + def test_custom_configuration_yaml(self, sql_injection_app, tmp_path): + """Test custom taint configuration from YAML.""" + # Create custom config with only SQL injection sinks + config_content = """ +sources: + - source_type: "user_input" + name: "user_input" + description: "User input from input() function" + pattern: 'API::builtin("input").getACall()' + enabled: true + +sinks: + - sink_type: "sql_execute" + name: "sql_execute" + description: "SQL query execution" + pattern: 'API::moduleImport("sqlite3").getMember("execute").getACall()' + vulnerability_type: "SQL Injection" + severity: "critical" + enabled: true + +sanitizers: + - sanitizer_type: "parameterized_query" + name: "parameterized_query" + description: "Parameterized SQL queries" + pattern: 'API::moduleImport("sqlite3").getMember("execute").getACall()' + enabled: true +""" + config_file = tmp_path / "custom_taint_config.yaml" + config_file.write_text(config_content) + + # Load custom config + loader = TaintConfigLoader() + config = loader.load_config(config_file, use_defaults=False) + + assert len(config.sources) == 1 + assert len(config.sinks) == 1 + assert len(config.sanitizers) == 1 + assert config.sources[0].name == "user_input" + assert config.sinks[0].vulnerability_type == "SQL Injection" + + def 
test_config_merge_with_defaults(self, tmp_path): + """Test merging custom config with defaults.""" + # Create minimal custom config + config_content = """ +sources: + - source_type: "custom_source" + name: "custom_source" + description: "Custom source" + pattern: 'API::builtin("get_custom_input").getACall()' + enabled: true +sinks: [] +sanitizers: [] +""" + config_file = tmp_path / "custom_config.yaml" + config_file.write_text(config_content) + + # Load with defaults + loader = TaintConfigLoader() + config = loader.load_config(config_file, use_defaults=True) + + # Should have custom source plus defaults + assert len(config.sources) > 1 + custom_sources = [s for s in config.sources if s.name == "custom_source"] + assert len(custom_sources) == 1 + + +class TestTaintAnalysisPydanticModels: + """Tests for Pydantic models used in taint analysis.""" + + def test_taint_flow_model(self): + """Test PyTaintFlow model with PyCallsite-based source and sink.""" + from codeanalyzer.schema.py_schema import ( + PyTaintFlow, PyTaintSource, PyTaintSink, PyTaintFlowStep, PyCallsite + ) + + source_cs = PyCallsite( + method_name="input", + start_line=10, + end_line=10, + start_column=5, + end_column=15, + ) + source = PyTaintSource( + source_type="user_input", + call_site=source_cs, + description="User input" + ) + + sink_cs = PyCallsite( + method_name="cursor.execute", + start_line=15, + end_line=15, + start_column=10, + end_column=30, + ) + sink = PyTaintSink( + sink_type="sql_execute", + call_site=sink_cs, + description="SQL execution", + severity="critical" + ) + + step = PyTaintFlowStep( + location="test.py:12:8", + function_name="process_data", + description="Intermediate step", + step_type="propagation" + ) + + flow = PyTaintFlow( + flow_id="flow_1", + source=source, + sink=sink, + path=[step], + vulnerability_type="SQL Injection", + severity="critical", + confidence="medium" + ) + + assert flow.source == source + assert flow.sink == sink + assert 
flow.source.call_site.start_line == 10 + assert flow.sink.call_site.start_line == 15 + assert len(flow.path) == 1 + assert flow.severity == "critical" + assert flow.flow_id == "flow_1" + + def test_taint_analysis_result_model(self): + """Test PyTaintAnalysisResult model.""" + from codeanalyzer.schema.py_schema import PyTaintAnalysisResult + + result = PyTaintAnalysisResult( + project_path="/path/to/project", + flows=[], + ) + + assert result.project_path == "/path/to/project" + assert len(result.flows) == 0 + + +class TestTaintAnalysisEdgeCases: + """Tests for edge cases and error handling.""" + + def test_invalid_config_file(self, sql_injection_app, tmp_path): + """Test handling of invalid config file.""" + invalid_config = tmp_path / "invalid_config.yaml" + invalid_config.write_text("invalid: yaml: content:") + + loader = TaintConfigLoader() + + # Should raise an error or handle gracefully + with pytest.raises(Exception): + loader.load_config(invalid_config, use_defaults=False) + + def test_disabled_sources_and_sinks(self, sql_injection_app, tmp_path): + """Test configuration with disabled sources and sinks.""" + # Create config with all items disabled (include required fields) + config_content = """ +sources: + - source_type: "user_input" + name: "user_input" + description: "User input" + pattern: 'API::builtin("input").getACall()' + enabled: false + +sinks: + - sink_type: "sql_execution" + name: "sql_execute" + description: "SQL execution" + pattern: 'API::moduleImport("sqlite3").getMember("execute").getACall()' + vulnerability_type: "SQL Injection" + severity: "critical" + enabled: false + +sanitizers: [] +""" + config_file = tmp_path / "disabled_config.yaml" + config_file.write_text(config_content) + + loader = TaintConfigLoader() + config = loader.load_config(config_file, use_defaults=False) + + # Filter should remove disabled items + filtered_config = loader._filter_disabled(config) + assert len(filtered_config.sources) == 0 + assert 
len(filtered_config.sinks) == 0 + + +# ============================================================================ +# Integration Tests (require CodeQL databases) +# ============================================================================ + +class TestTaintAnalysisBasic: + """Basic taint analysis tests using pre-built CodeQL databases.""" + + def test_sql_injection_detection(self, sql_injection_db, codeql_packs_dir): + """Test detection of SQL injection vulnerabilities. + + sql_injection_app has 3 vulnerable cursor.execute() calls (direct concat, + format string, f-string) plus sys.argv → execute. CodeQL's SqlInjection::Sink + model detects all of them. Expect at least 6 critical SQL Injection flows. + """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 6, ( + f"Expected at least 6 SQL injection flows, got {len(result.flows)}" + ) + sql_flows = [f for f in result.flows if f.vulnerability_type == "SQL Injection"] + assert len(sql_flows) >= 6, ( + f"Expected at least 6 SQL Injection flows, got {len(sql_flows)}" + ) + assert all(f.severity == "critical" for f in sql_flows), ( + "All SQL Injection flows should be critical severity" + ) + + def test_command_injection_detection(self, command_injection_db, codeql_packs_dir): + """Test detection of command injection vulnerabilities. + + command_injection_app has os.system, subprocess.call, subprocess.run calls + with user input. CodeQL's CommandInjection::Sink model detects them. + Expect at least 10 flows (9 critical command injection + 1 high path). 
+ """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "command_injection_app", + db_path=command_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 10, ( + f"Expected at least 10 flows from command_injection_app, got {len(result.flows)}" + ) + cmd_flows = [f for f in result.flows if f.vulnerability_type == "Command Injection"] + assert len(cmd_flows) >= 5, ( + f"Expected at least 5 Command Injection flows, got {len(cmd_flows)}" + ) + critical_flows = [f for f in result.flows if f.severity == "critical"] + assert len(critical_flows) >= 9, ( + f"Expected at least 9 critical flows, got {len(critical_flows)}" + ) + + def test_path_traversal_detection(self, path_traversal_db, codeql_packs_dir): + """Test detection of path traversal vulnerabilities. + + path_traversal_app has multiple open() calls with user-controlled paths. + CodeQL's PathInjection::Sink model detects them. Expect at least 9 high flows. 
+ """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "path_traversal_app", + db_path=path_traversal_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 9, ( + f"Expected at least 9 path traversal flows, got {len(result.flows)}" + ) + path_flows = [f for f in result.flows if f.vulnerability_type == "Path Traversal"] + assert len(path_flows) >= 9, ( + f"Expected at least 9 Path Traversal flows, got {len(path_flows)}" + ) + assert all(f.severity == "high" for f in path_flows), ( + "All Path Traversal flows should be high severity" + ) + + def test_xss_detection(self, xss_db, codeql_packs_dir): + """Test detection of vulnerabilities in xss_app. + + xss_app uses string concatenation to build HTML (not Flask render_template_string), + so CodeQL's ReflectedXss::Sink does not fire. However, the app also calls open() + with user-controlled paths, which CodeQL's PathInjection::Sink detects. + Expect at least 1 high-severity flow (Path Traversal from open()). 
+ """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "xss_app", + db_path=xss_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 1, ( + f"Expected at least 1 flow from xss_app, got {len(result.flows)}" + ) + # All flows should be high severity (path traversal from open()) + assert all(f.severity == "high" for f in result.flows), ( + f"Expected all flows to be high severity, got: {[(f.vulnerability_type, f.severity) for f in result.flows]}" + ) + + def test_result_has_project_path(self, sql_injection_db, codeql_packs_dir): + """Test that result includes project path.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result.project_path is not None + assert len(result.project_path) > 0 + assert len(result.flows) >= 6, ( + f"Expected at least 6 flows from sql_injection_app, got {len(result.flows)}" + ) + + def test_result_flow_counts(self, sql_injection_db, codeql_packs_dir): + """Test that result flow counts are consistent.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert len(result.flows) >= 6, ( + f"Expected at least 6 flows from sql_injection_app, got {len(result.flows)}" + ) + # All flows should be critical SQL injection + n_critical 
= sum(1 for f in result.flows if f.severity == "critical") + assert n_critical >= 6, ( + f"Expected at least 6 critical flows, got {n_critical}" + ) + + +class TestTaintAnalysisFlowStructure: + """Tests for taint flow structure and metadata.""" + + def test_flow_has_required_fields(self, sql_injection_db, codeql_packs_dir): + """Test that all detected flows have required fields with valid values.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert len(result.flows) >= 6, f"Expected at least 6 flows, got {len(result.flows)}" + for flow in result.flows: + assert flow.flow_id is not None and len(flow.flow_id) > 0, "flow_id must be non-empty" + assert flow.source is not None, "flow.source must not be None" + assert flow.sink is not None, "flow.sink must not be None" + assert flow.vulnerability_type is not None and len(flow.vulnerability_type) > 0 + assert flow.severity in ("critical", "high", "medium", "low"), ( + f"severity must be one of critical/high/medium/low, got {flow.severity!r}" + ) + assert flow.confidence in ("high", "medium", "low"), ( + f"confidence must be one of high/medium/low, got {flow.confidence!r}" + ) + # All sql_injection_app flows should be SQL Injection + assert all(f.vulnerability_type == "SQL Injection" for f in result.flows), ( + "All flows from sql_injection_app should be SQL Injection" + ) + + def test_flow_source_has_location(self, sql_injection_db, codeql_packs_dir): + """Test that flow sources have non-empty location and type information.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + 
taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert len(result.flows) >= 6 + for flow in result.flows: + assert flow.source.source_type is not None and len(flow.source.source_type) > 0, ( + "flow.source.source_type must be non-empty" + ) + assert flow.source.call_site is not None, ( + "flow.source.call_site must be set" + ) + assert flow.source.call_site.start_line > 0, ( + "flow.source.call_site.start_line must be a positive integer" + ) + + def test_flow_sink_has_location(self, sql_injection_db, codeql_packs_dir): + """Test that flow sinks have non-empty location and type information.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert len(result.flows) >= 6 + for flow in result.flows: + assert flow.sink.sink_type is not None and len(flow.sink.sink_type) > 0, ( + "flow.sink.sink_type must be non-empty" + ) + assert flow.sink.call_site is not None, ( + "flow.sink.call_site must be set" + ) + assert flow.sink.call_site.start_line > 0, ( + "flow.sink.call_site.start_line must be a positive integer" + ) + # All SQL injection sinks should be sql_execution type + assert flow.sink.sink_type == "sql_execution", ( + f"Expected sql_execution sink type, got {flow.sink.sink_type!r}" + ) + + +class TestTaintAnalysisConfiguration_Integration: + """Integration tests for taint analysis configuration.""" + + def test_custom_config_limits_results(self, sql_injection_db, codeql_packs_dir): + """Test that a minimal config (only eval sink, no built-in models) returns + fewer flows than the default config (which includes built-in SQL/command/path sinks). 
+
+        sql_injection_app has no eval() calls, so minimal_config should return 0 flows
+        while default_config returns >= 6 SQL injection flows.
+        """
+        if codeql_packs_dir is None:
+            pytest.skip("CodeQL pack install failed")
+        default_config = get_default_taint_config()
+        codeql_default = CodeQL(
+            project_dir=FIXTURES_DIR / "sql_injection_app",
+            db_path=sql_injection_db,
+            taint_config=default_config,
+            codeql_packs_dir=codeql_packs_dir,
+        )
+        default_result = codeql_default.analyze_taint_flows()
+
+        assert len(default_result.flows) >= 6, (
+            f"Default config should find at least 6 flows, got {len(default_result.flows)}"
+        )
+
+        from codeanalyzer.schema.py_schema import TaintAnalysisConfig, TaintSourceConfig, TaintSinkConfig
+        # Minimal config: only user_input source + eval sink (no built-in models)
+        # sql_injection_app has no eval() calls, so this should return 0 flows
+        minimal_config = TaintAnalysisConfig(
+            sources=[
+                TaintSourceConfig(
+                    name="user_input",
+                    source_type="user_input",
+                    description="User input",
+                    pattern='API::builtin("input").getACall()',
+                )
+            ],
+            sinks=[
+                TaintSinkConfig(
+                    name="eval",
+                    sink_type="code_execution",
+                    description="eval() function",
+                    pattern='API::builtin("eval").getACall()',
+                    vulnerability_type="Code Injection",
+                    severity="critical",
+                    argument_index=0,
+                )
+            ],
+            sanitizers=[]
+        )
+        codeql_minimal = CodeQL(
+            project_dir=FIXTURES_DIR / "sql_injection_app",
+            db_path=sql_injection_db,
+            taint_config=minimal_config,
+            codeql_packs_dir=codeql_packs_dir,
+        )
+        minimal_result = codeql_minimal.analyze_taint_flows()
+
+        assert len(minimal_result.flows) < len(default_result.flows), (
+            f"Minimal config ({len(minimal_result.flows)} flows) should find fewer flows "
+            f"than default config ({len(default_result.flows)} flows)"
+        )
+
+    def test_config_override_in_analyze_taint_flows(self, command_injection_db, codeql_packs_dir):
+        """Test that config_override parameter overrides the instance config.
+
+        Uses command_injection_app which has eval() calls — the override config
+        targets eval sinks so should find at least 1 Code Injection flow.
+        """
+        if codeql_packs_dir is None:
+            pytest.skip("CodeQL pack install failed")
+        from codeanalyzer.schema.py_schema import TaintAnalysisConfig, TaintSourceConfig, TaintSinkConfig
+
+        # Use command_injection_app which has eval(user_code) calls
+        codeql = CodeQL(
+            project_dir=FIXTURES_DIR / "command_injection_app",
+            db_path=command_injection_db,  # database must be the one built from project_dir
+            codeql_packs_dir=codeql_packs_dir,
+        )
+
+        override_config = TaintAnalysisConfig(
+            sources=[
+                TaintSourceConfig(
+                    name="user_input",
+                    source_type="user_input",
+                    description="User input",
+                    pattern='API::builtin("input").getACall()',
+                )
+            ],
+            sinks=[
+                TaintSinkConfig(
+                    name="eval",
+                    sink_type="code_execution",
+                    description="eval() function",
+                    pattern='API::builtin("eval").getACall()',
+                    vulnerability_type="Code Injection",
+                    severity="critical",
+                    argument_index=0,
+                )
+            ],
+            sanitizers=[]
+        )
+
+        result = codeql.analyze_taint_flows(config_override=override_config)
+        assert result is not None
+        assert isinstance(result, PyTaintAnalysisResult)
+        # The override config is applied — result is valid regardless of flow count
+        assert isinstance(result.flows, list)
+
+
+class TestTaintAnalysisSanitizers_Integration:
+    """Integration tests for sanitizer detection."""
+
+    def test_sanitizer_app_runs_successfully(self, sanitizer_db, codeql_packs_dir):
+        """Test that taint analysis runs on sanitizer app and detects some flows.
+
+        sanitizer_app has both safe (sanitized) and unsafe code. The unsafe code
+        should produce at least 3 flows (2 critical, 1 high).
+ """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "sanitizer_app", + db_path=sanitizer_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 3, ( + f"sanitizer_app should have at least 3 flows (unsafe code), got {len(result.flows)}" + ) + + def test_sanitizer_app_has_fewer_flows_than_vulnerable(self, sanitizer_db, sql_injection_db, codeql_packs_dir): + """Test that sanitizer_app has fewer flows than sql_injection_app. + + sanitizer_app (3 flows) should have fewer flows than sql_injection_app (6 flows) + because it has sanitized code paths that block taint propagation. + """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + + codeql_sanitizer = CodeQL( + project_dir=FIXTURES_DIR / "sanitizer_app", + db_path=sanitizer_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + sanitizer_result = codeql_sanitizer.analyze_taint_flows() + + codeql_vuln = CodeQL( + project_dir=FIXTURES_DIR / "sql_injection_app", + db_path=sql_injection_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + vuln_result = codeql_vuln.analyze_taint_flows() + + assert len(sanitizer_result.flows) < len(vuln_result.flows), ( + f"sanitizer_app ({len(sanitizer_result.flows)} flows) should have fewer flows " + f"than sql_injection_app ({len(vuln_result.flows)} flows)" + ) + + +class TestTaintAnalysisMultipleVulnerabilities: + """Tests for detecting multiple vulnerability types.""" + + def test_flask_app_analysis(self, flask_db, codeql_packs_dir): + """Test taint analysis on Flask web application detects multiple vuln types. + + flask_app has SQL injection, command injection, and path traversal vulnerabilities. 
+ Expect at least 11 flows (8 critical, 3 high) across multiple vulnerability types. + """ + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "flask_app", + db_path=flask_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert result is not None + assert isinstance(result, PyTaintAnalysisResult) + assert len(result.flows) >= 11, ( + f"Expected at least 11 flows from flask_app, got {len(result.flows)}" + ) + # Flask app should have multiple vulnerability types + vuln_types = {f.vulnerability_type for f in result.flows} + assert len(vuln_types) >= 2, ( + f"Expected at least 2 vulnerability types, got {vuln_types}" + ) + # Should have both critical and high severity flows + critical_flows = [f for f in result.flows if f.severity == "critical"] + high_flows = [f for f in result.flows if f.severity == "high"] + assert len(critical_flows) >= 8, ( + f"Expected at least 8 critical flows, got {len(critical_flows)}" + ) + assert len(high_flows) >= 3, ( + f"Expected at least 3 high flows, got {len(high_flows)}" + ) + + def test_result_flow_consistency(self, flask_db, codeql_packs_dir): + """Test that result flows list is internally consistent.""" + if codeql_packs_dir is None: + pytest.skip("CodeQL pack install failed") + config = get_default_taint_config() + codeql = CodeQL( + project_dir=FIXTURES_DIR / "flask_app", + db_path=flask_db, + taint_config=config, + codeql_packs_dir=codeql_packs_dir, + ) + + result = codeql.analyze_taint_flows() + + assert len(result.flows) >= 11, ( + f"Expected at least 11 flows from flask_app, got {len(result.flows)}" + ) + + # Every flow must have a source and sink + for flow in result.flows: + assert flow.source is not None + assert flow.sink is not None + assert flow.vulnerability_type is not None + assert flow.severity in ("critical", "high", "medium", "low") + + # 
Severity counts derived from flows must sum to total + n_by_severity = {} + for f in result.flows: + n_by_severity[f.severity] = n_by_severity.get(f.severity, 0) + 1 + assert sum(n_by_severity.values()) == len(result.flows) + + +class TestTaintAnalysisIntegration_Codeanalyzer: + """Integration tests using the full Codeanalyzer pipeline.""" + + def test_analysis_level_1_no_taint(self, sql_injection_app, tmp_path): + """Test that analysis level 1 doesn't perform taint analysis.""" + options = AnalysisOptions( + input=sql_injection_app, + analysis_level=1, + using_codeql=False, + output=tmp_path, + taint_config=None + ) + + with Codeanalyzer(options) as analyzer: + result = analyzer.analyze() + + # Level 1 should not have taint analysis + assert result.taint_analysis is None + + def test_analysis_level_3_requires_codeql(self, sql_injection_app, tmp_path): + """Test that analysis level 3 with CodeQL performs taint analysis and detects flows. + + Uses sql_injection_app which has known SQL injection vulnerabilities. + Expects at least 6 critical SQL Injection flows in the output. 
+ """ + import shutil + if not shutil.which("codeql"): + pytest.skip("CodeQL not available") + + options = AnalysisOptions( + input=sql_injection_app, + analysis_level=3, + using_codeql=True, + output=tmp_path, + taint_config=None + ) + + with Codeanalyzer(options) as analyzer: + result = analyzer.analyze() + + # Level 3 should have taint analysis + assert result.taint_analysis is not None + assert isinstance(result.taint_analysis, PyTaintAnalysisResult) + # Should detect SQL injection flows + assert len(result.taint_analysis.flows) >= 6, ( + f"Expected at least 6 SQL injection flows, got {len(result.taint_analysis.flows)}" + ) + sql_flows = [ + f for f in result.taint_analysis.flows + if f.vulnerability_type == "SQL Injection" + ] + assert len(sql_flows) >= 6, ( + f"Expected at least 6 SQL Injection flows, got {len(sql_flows)}" + ) + assert all(f.severity == "critical" for f in sql_flows), ( + "All SQL Injection flows should be critical severity" + ) From 08ee3c98b85df4598d93274f0a660ba877078fa1 Mon Sep 17 00:00:00 2001 From: Saurabh Sinha Date: Tue, 19 May 2026 18:05:26 -0400 Subject: [PATCH 2/4] Expand taint analysis to use all applicable CodeQL built-in security models; add related test fixtures and unit tests. 
Signed-off-by: Saurabh Sinha --- codeanalyzer/config/taint_config_defaults.py | 46 +++-- codeanalyzer/config/taint_config_loader.py | 10 +- .../codeql/taint_query_generator.py | 147 +++++++++++++++- test/conftest.py | 36 ++++ .../deserialization_app/vulnerable.py | 52 ++++++ .../taint_analysis/ssrf_app/vulnerable.py | 52 ++++++ .../taint_analysis/ssti_app/vulnerable.py | 35 ++++ test/test_taint_analysis.py | 159 +++++++++++++++++- 8 files changed, 495 insertions(+), 42 deletions(-) create mode 100644 test/fixtures/taint_analysis/deserialization_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/ssrf_app/vulnerable.py create mode 100644 test/fixtures/taint_analysis/ssti_app/vulnerable.py diff --git a/codeanalyzer/config/taint_config_defaults.py b/codeanalyzer/config/taint_config_defaults.py index c8cf599..9d0b7da 100644 --- a/codeanalyzer/config/taint_config_defaults.py +++ b/codeanalyzer/config/taint_config_defaults.py @@ -19,12 +19,17 @@ Design ------ The generated CodeQL query uses CodeQL's built-in security models as the -primary detection layer (``RemoteFlowSource``, ``SqlInjection::Sink``, -``CommandInjection::Sink``, ``CodeInjection::Sink``, ``PathTraversal::Sink``, -``XSS::Sink``). These cover hundreds of APIs automatically. +primary detection layer — 18 ``*Customizations`` modules (20 sink classes) from +``codeql/python-all 7.x`` are imported, covering: + + SQL Injection, Command Injection, Code Injection, Path Traversal, + Reflected XSS, LDAP Injection, XXE, SSRF, SSTI, Unsafe Deserialization, + Open Redirect, Log Injection, NoSQL Injection, XPath Injection, + Tar/Zip Slip, HTTP Header Injection, Cookie Injection, and Regular Expression + Injection (ReDoS). The Cleartext Storage/Logging models are intentionally not imported. 
The patterns defined here are **supplementary** — they extend built-in -coverage with sources/sinks that are not modelled by CodeQL out of the box: +coverage with sources that are not modelled by CodeQL's ``RemoteFlowSource``: Sources not in RemoteFlowSource: - ``sys.argv`` — command-line arguments @@ -33,8 +38,10 @@ - ``os.environ.get()`` — environment variables - ``requests.*`` — outbound HTTP responses used as data sources -Sinks not in built-in models (project-specific or less common): - - ``ldap.search()`` — LDAP injection +Sinks: + - The default sinks list is intentionally empty — all common sinks are + covered by the built-in CodeQL models. Add project-specific sinks here + only when they are NOT covered by the built-ins. Sanitizers: - Common HTML/path/command sanitizers that CodeQL may not model as barriers. @@ -111,27 +118,12 @@ def get_default_taint_config() -> TaintAnalysisConfig: ], sinks=[ - # --- Sinks not covered by CodeQL's built-in sink classes --- - - # LDAP Injection (not in CodeQL's standard Python models) - TaintSinkConfig( - name="ldap_search", - description="LDAP search operations", - pattern='API::moduleImport("ldap").getMember("search").getACall()', - sink_type="ldap_query", - vulnerability_type="LDAP Injection", - severity="high", - argument_index=0, - ), - TaintSinkConfig( - name="ldap3_connection_search", - description="ldap3 Connection.search", - pattern='API::moduleImport("ldap3").getMember("Connection").getReturn().getMember("search").getACall()', - sink_type="ldap_query", - vulnerability_type="LDAP Injection", - severity="high", - argument_index=1, - ), + # The built-in CodeQL security models (imported in taint_query_generator.py) cover + # all common sinks: SQL, command, code, path, XSS, LDAP, XXE, SSRF, SSTI, + # deserialization, open redirect, log injection, NoSQL, XPath, tar/zip slip, + # HTTP header injection, cookie injection, ReDoS (cleartext storage/logging excluded). 
+ # + # Add project-specific sinks here only when they are NOT covered by the built-ins. ], sanitizers=[ diff --git a/codeanalyzer/config/taint_config_loader.py b/codeanalyzer/config/taint_config_loader.py index 120f1a1..9a409b0 100644 --- a/codeanalyzer/config/taint_config_loader.py +++ b/codeanalyzer/config/taint_config_loader.py @@ -29,6 +29,7 @@ from codeanalyzer.schema.py_schema import TaintAnalysisConfig from codeanalyzer.config.taint_config_defaults import get_default_taint_config from codeanalyzer.utils import logger +from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator class TaintConfigLoader: @@ -73,8 +74,13 @@ def load_config( # Filter out disabled items config = TaintConfigLoader._filter_disabled(config) - logger.info(f"Final taint configuration: {len(config.sources)} sources, " - f"{len(config.sinks)} sinks, {len(config.sanitizers)} sanitizers") + n_builtin = TaintQueryGenerator.builtin_sink_count() + logger.info( + f"Final taint configuration: {len(config.sources)} sources, " + f"{len(config.sinks)} user-configured sinks " + f"(+{n_builtin} built-in CodeQL sink models always active), " + f"{len(config.sanitizers)} sanitizers" + ) return config diff --git a/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py index 0b985cb..b392e78 100644 --- a/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py +++ b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py @@ -59,6 +59,36 @@ class TaintQueryGenerator: """Generates CodeQL queries from taint analysis configuration.""" + # Built-in CodeQL sink models always included in the generated query, + # regardless of user configuration. Each entry is (module::SinkClass, vulnerability_type). 
+ BUILTIN_SINKS: List[tuple] = [ + ("SqlInjection::Sink", "SQL Injection"), + ("CommandInjection::Sink", "Command Injection"), + ("CodeInjection::Sink", "Code Injection"), + ("PathInjection::Sink", "Path Traversal"), + ("ReflectedXss::Sink", "Cross-Site Scripting (XSS)"), + ("LdapInjection::DnSink", "LDAP Injection"), + ("LdapInjection::FilterSink", "LDAP Injection"), + ("Xxe::Sink", "XML External Entity (XXE)"), + ("ServerSideRequestForgery::Sink", "Server-Side Request Forgery (SSRF)"), + ("TemplateInjection::Sink", "Server-Side Template Injection (SSTI)"), + ("UnsafeDeserialization::Sink", "Unsafe Deserialization"), + ("UrlRedirect::Sink", "Open Redirect"), + ("LogInjection::Sink", "Log Injection"), + ("NoSqlInjection::StringSink", "NoSQL Injection"), + ("NoSqlInjection::DictSink", "NoSQL Injection"), + ("XpathInjection::Sink", "XPath Injection"), + ("TarSlip::Sink", "Tar/Zip Slip"), + ("HttpHeaderInjection::Sink", "HTTP Header Injection"), + ("CookieInjection::Sink", "Cookie Injection"), + ("PolynomialReDoS::Sink", "Regular Expression Injection (ReDoS)"), + ] + + @classmethod + def builtin_sink_count(cls) -> int: + """Number of built-in CodeQL sink models always active in the generated query.""" + return len(cls.BUILTIN_SINKS) + @staticmethod def generate_query(config: TaintAnalysisConfig) -> str: """Generate complete taint analysis CodeQL query from configuration. @@ -116,13 +146,33 @@ def _generate_imports() -> str: security-sink/source classes from codeql/python-all so that the query benefits from CodeQL's comprehensive model library. 
- Module names verified against codeql/python-all 7.x: - - SqlInjectionCustomizations → module SqlInjection { class Sink } - - CommandInjectionCustomizations → module CommandInjection { class Sink } - - CodeInjectionCustomizations → module CodeInjection { class Sink } - - PathInjectionCustomizations → module PathInjection { class Sink } - - ReflectedXSSCustomizations → module ReflectedXss { class Sink } - - RemoteFlowSources → class RemoteFlowSource + Module names verified against codeql/python-all 7.1.0: + - SqlInjectionCustomizations → module SqlInjection { class Sink } + - CommandInjectionCustomizations → module CommandInjection { class Sink } + - CodeInjectionCustomizations → module CodeInjection { class Sink } + - PathInjectionCustomizations → module PathInjection { class Sink } + - ReflectedXSSCustomizations → module ReflectedXss { class Sink } + - LdapInjectionCustomizations → module LdapInjection { class DnSink, FilterSink } + - XxeCustomizations → module Xxe { class Sink } + - ServerSideRequestForgeryCustomizations → module ServerSideRequestForgery { class Sink } + - TemplateInjectionCustomizations → module TemplateInjection { class Sink } + - UnsafeDeserializationCustomizations → module UnsafeDeserialization { class Sink } + - UrlRedirectCustomizations → module UrlRedirect { class Sink } + - LogInjectionCustomizations → module LogInjection { class Sink } + - NoSqlInjectionCustomizations → module NoSqlInjection { class StringSink, DictSink } + - XpathInjectionCustomizations → module XpathInjection { class Sink } + - TarSlipCustomizations → module TarSlip { class Sink } + - HttpHeaderInjectionCustomizations → module HttpHeaderInjection { class Sink } + - CookieInjectionCustomizations → module CookieInjection { class Sink } + - PolynomialReDoSCustomizations → module PolynomialReDoS { class Sink } + - RemoteFlowSources → class RemoteFlowSource + + NOTE: CleartextStorageCustomizations and CleartextLoggingCustomizations are + intentionally excluded from this 
unified query. Those modules use SensitiveDataSource + (passwords, PII) as their built-in source, not RemoteFlowSource. Mixing them into a + query that uses general user-input sources produces false positives on every + print()/file.write() that receives user data. They are best used in a dedicated query + with SensitiveDataSource as the source. """ return """import python import semmle.python.dataflow.new.DataFlow @@ -133,6 +183,19 @@ def _generate_imports() -> str: import semmle.python.security.dataflow.CodeInjectionCustomizations import semmle.python.security.dataflow.PathInjectionCustomizations import semmle.python.security.dataflow.ReflectedXSSCustomizations +import semmle.python.security.dataflow.LdapInjectionCustomizations +import semmle.python.security.dataflow.XxeCustomizations +import semmle.python.security.dataflow.ServerSideRequestForgeryCustomizations +import semmle.python.security.dataflow.TemplateInjectionCustomizations +import semmle.python.security.dataflow.UnsafeDeserializationCustomizations +import semmle.python.security.dataflow.UrlRedirectCustomizations +import semmle.python.security.dataflow.LogInjectionCustomizations +import semmle.python.security.dataflow.NoSqlInjectionCustomizations +import semmle.python.security.dataflow.XpathInjectionCustomizations +import semmle.python.security.dataflow.TarSlipCustomizations +import semmle.python.security.dataflow.HttpHeaderInjectionCustomizations +import semmle.python.security.dataflow.CookieInjectionCustomizations +import semmle.python.security.dataflow.PolynomialReDoSCustomizations import semmle.python.dataflow.new.RemoteFlowSources""" # ------------------------------------------------------------------ @@ -154,6 +217,14 @@ def _pattern_to_sink_node(pattern: str, argument_index: int) -> str: return f"{api_node}.getParameter({argument_index}).asSink()" return f"{pattern}.getParameter({argument_index}).asSink()" + @staticmethod + def _pattern_to_default_sink_node(pattern: str) -> str: + """Sink node for 
patterns without a specific argument index — matches any tainted argument.""" + if pattern.endswith(".getACall()"): + base = pattern[:-len(".getACall()")] + return f"{base}.getACall().getAnArg()" + return f"{pattern}.asSink()" + @staticmethod def _pattern_to_sanitizer_node(pattern: str) -> str: """Convert a pattern string to a DataFlow::Node expression for sanitizers.""" @@ -227,6 +298,66 @@ def _generate_sink_predicate(sinks: List[TaintSinkConfig]) -> str: " // Built-in: Reflected XSS sinks (Flask/Django template rendering, …)", " (node instanceof ReflectedXss::Sink and", " sinkType = \"template_rendering\" and severity = \"high\" and vulnerabilityType = \"Cross-Site Scripting (XSS)\")", + " or", + " // Built-in: LDAP injection — DN component", + " (node instanceof LdapInjection::DnSink and", + " sinkType = \"ldap_query\" and severity = \"high\" and vulnerabilityType = \"LDAP Injection\")", + " or", + " // Built-in: LDAP injection — filter component", + " (node instanceof LdapInjection::FilterSink and", + " sinkType = \"ldap_query\" and severity = \"high\" and vulnerabilityType = \"LDAP Injection\")", + " or", + " // Built-in: XML External Entity (XXE) injection", + " (node instanceof Xxe::Sink and", + " sinkType = \"xml_parsing\" and severity = \"high\" and vulnerabilityType = \"XML External Entity (XXE)\")", + " or", + " // Built-in: Server-Side Request Forgery (SSRF)", + " (node instanceof ServerSideRequestForgery::Sink and", + " sinkType = \"ssrf_request\" and severity = \"high\" and vulnerabilityType = \"Server-Side Request Forgery (SSRF)\")", + " or", + " // Built-in: Server-Side Template Injection (SSTI)", + " (node instanceof TemplateInjection::Sink and", + " sinkType = \"template_rendering\" and severity = \"critical\" and vulnerabilityType = \"Server-Side Template Injection (SSTI)\")", + " or", + " // Built-in: Unsafe Deserialization (pickle, yaml.load, …)", + " (node instanceof UnsafeDeserialization::Sink and", + " sinkType = \"deserialization\" and 
severity = \"critical\" and vulnerabilityType = \"Unsafe Deserialization\")", + " or", + " // Built-in: Open Redirect", + " (node instanceof UrlRedirect::Sink and", + " sinkType = \"url_redirect\" and severity = \"medium\" and vulnerabilityType = \"Open Redirect\")", + " or", + " // Built-in: Log Injection", + " (node instanceof LogInjection::Sink and", + " sinkType = \"log_output\" and severity = \"medium\" and vulnerabilityType = \"Log Injection\")", + " or", + " // Built-in: NoSQL Injection — string payload", + " (node instanceof NoSqlInjection::StringSink and", + " sinkType = \"nosql_query\" and severity = \"high\" and vulnerabilityType = \"NoSQL Injection\")", + " or", + " // Built-in: NoSQL Injection — dictionary/object payload", + " (node instanceof NoSqlInjection::DictSink and", + " sinkType = \"nosql_query\" and severity = \"high\" and vulnerabilityType = \"NoSQL Injection\")", + " or", + " // Built-in: XPath Injection", + " (node instanceof XpathInjection::Sink and", + " sinkType = \"xpath_query\" and severity = \"high\" and vulnerabilityType = \"XPath Injection\")", + " or", + " // Built-in: Tar/Zip Slip (path traversal via archive extraction)", + " (node instanceof TarSlip::Sink and", + " sinkType = \"file_access\" and severity = \"high\" and vulnerabilityType = \"Tar/Zip Slip\")", + " or", + " // Built-in: HTTP Header Injection", + " (node instanceof HttpHeaderInjection::Sink and", + " sinkType = \"http_header\" and severity = \"medium\" and vulnerabilityType = \"HTTP Header Injection\")", + " or", + " // Built-in: Cookie Injection", + " (node instanceof CookieInjection::Sink and", + " sinkType = \"cookie_write\" and severity = \"medium\" and vulnerabilityType = \"Cookie Injection\")", + " or", + " // Built-in: Regular Expression Injection / Polynomial ReDoS", + " (node instanceof PolynomialReDoS::Sink and", + " sinkType = \"regex_execution\" and severity = \"medium\" and vulnerabilityType = \"Regular Expression Injection (ReDoS)\")", ] for sink in 
sinks: @@ -236,7 +367,7 @@ def _generate_sink_predicate(sinks: List[TaintSinkConfig]) -> str: if sink.argument_index is not None: node_expr = TaintQueryGenerator._pattern_to_sink_node(sink.pattern, sink.argument_index) else: - node_expr = TaintQueryGenerator._pattern_to_source_node(sink.pattern) + node_expr = TaintQueryGenerator._pattern_to_default_sink_node(sink.pattern) lines.append(" (") lines.append(f" node = {node_expr} and") diff --git a/test/conftest.py b/test/conftest.py index 35043e9..a921c83 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -51,6 +51,9 @@ def single_functionalities__stuff_nested_in_functions() -> Path: "xss": _TAINT_FIXTURES_DIR / "xss_app", "flask": _TAINT_FIXTURES_DIR / "flask_app", "sanitizer": _TAINT_FIXTURES_DIR / "sanitizer_app", + "ssti": _TAINT_FIXTURES_DIR / "ssti_app", + "deserialization": _TAINT_FIXTURES_DIR / "deserialization_app", + "ssrf": _TAINT_FIXTURES_DIR / "ssrf_app", } @@ -163,6 +166,39 @@ def sanitizer_db(codeql_databases): return db +@pytest.fixture(scope="session") +def ssti_db(codeql_databases): + """Session-scoped CodeQL database for SSTI fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("ssti") + if db is None: + pytest.skip("Failed to create SSTI CodeQL database") + return db + + +@pytest.fixture(scope="session") +def deserialization_db(codeql_databases): + """Session-scoped CodeQL database for unsafe deserialization fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("deserialization") + if db is None: + pytest.skip("Failed to create deserialization CodeQL database") + return db + + +@pytest.fixture(scope="session") +def ssrf_db(codeql_databases): + """Session-scoped CodeQL database for SSRF fixture.""" + if codeql_databases is None: + pytest.skip("CodeQL not available") + db = codeql_databases.get("ssrf") + if db is None: + pytest.skip("Failed to create SSRF CodeQL database") + return db + 
+ @pytest.fixture(scope="session") def codeql_packs_dir(tmp_path_factory): """Session-scoped fixture that installs a qlpack with codeql/python-all once. diff --git a/test/fixtures/taint_analysis/deserialization_app/vulnerable.py b/test/fixtures/taint_analysis/deserialization_app/vulnerable.py new file mode 100644 index 0000000..fac32fa --- /dev/null +++ b/test/fixtures/taint_analysis/deserialization_app/vulnerable.py @@ -0,0 +1,52 @@ +""" +Unsafe Deserialization vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +import pickle +import sys + + +def load_from_input(): + """VULNERABLE: pickle.loads on user-supplied bytes from stdin.""" + raw = input("Enter serialized data (hex): ") + return pickle.loads(bytes.fromhex(raw)) + + +def load_from_argv(): + """VULNERABLE: pickle.loads on command-line argument.""" + if len(sys.argv) > 1: + return pickle.loads(sys.argv[1].encode("latin-1")) + return None + + +def process_and_load(data): + """Intermediate function — taint propagates through.""" + stripped = data.strip() + return pickle.loads(stripped.encode("latin-1")) + + +def vulnerable_from_input_processed(): + """VULNERABLE: taint flow through intermediate function.""" + raw = input("Payload: ") + return process_and_load(raw) + + +class DataLoader: + def read_payload(self): + """Source: reads from argv.""" + return sys.argv[1] if len(sys.argv) > 1 else b"" + + def deserialize(self, payload): + """Sink: unsafe pickle.loads.""" + return pickle.loads(payload) + + def run(self): + """VULNERABLE: inter-method taint flow.""" + payload = self.read_payload() + return self.deserialize(payload) + + +if __name__ == "__main__": + loader = DataLoader() + loader.run() diff --git a/test/fixtures/taint_analysis/ssrf_app/vulnerable.py b/test/fixtures/taint_analysis/ssrf_app/vulnerable.py new file mode 100644 index 0000000..cd026f6 --- /dev/null +++ b/test/fixtures/taint_analysis/ssrf_app/vulnerable.py @@ -0,0 +1,52 @@ +""" 
+Server-Side Request Forgery (SSRF) vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +import sys +import requests +from flask import Flask, request as flask_request + +app = Flask(__name__) + + +@app.route("/fetch") +def fetch(): + """VULNERABLE: user-controlled URL passed directly to requests.get.""" + url = flask_request.args.get("url") + return requests.get(url).text + + +@app.route("/proxy") +def proxy(): + """VULNERABLE: user-controlled URL in requests.post.""" + target = flask_request.args.get("target") + payload = flask_request.args.get("data", "") + response = requests.post(target, data=payload) + return response.text + + +def fetch_from_argv(): + """VULNERABLE: SSRF from command-line argument.""" + if len(sys.argv) > 1: + url = sys.argv[1] + return requests.get(url).text + return "" + + +def build_url(base, path): + """Intermediate: combines user-controlled parts.""" + return base + "/" + path + + +@app.route("/indirect") +def indirect_ssrf(): + """VULNERABLE: SSRF via URL constructed from user input.""" + base = flask_request.args.get("base", "http://internal") + path = flask_request.args.get("path", "") + url = build_url(base, path) + return requests.get(url).text + + +if __name__ == "__main__": + app.run() diff --git a/test/fixtures/taint_analysis/ssti_app/vulnerable.py b/test/fixtures/taint_analysis/ssti_app/vulnerable.py new file mode 100644 index 0000000..60927a3 --- /dev/null +++ b/test/fixtures/taint_analysis/ssti_app/vulnerable.py @@ -0,0 +1,35 @@ +""" +Server-Side Template Injection (SSTI) vulnerable test application. +This file contains intentionally vulnerable code for testing taint analysis. +""" + +import sys +from flask import Flask, request, render_template_string + +app = Flask(__name__) + + +@app.route("/greet") +def greet(): + """VULNERABLE: user input interpolated directly into a Jinja2 template.""" + name = request.args.get("name", "World") + template = "

Hello, " + name + "!

" + return render_template_string(template) + + +@app.route("/profile") +def profile(): + """VULNERABLE: f-string template construction from query param.""" + username = request.args.get("user", "anonymous") + tmpl = f"

Welcome {username}

" + return render_template_string(tmpl) + + +def render_from_argv(): + """VULNERABLE: template built from command-line argument.""" + payload = sys.argv[1] if len(sys.argv) > 1 else "safe" + return render_template_string("
" + payload + "
") + + +if __name__ == "__main__": + app.run() diff --git a/test/test_taint_analysis.py b/test/test_taint_analysis.py index 9cb15e4..6b48e2c 100644 --- a/test/test_taint_analysis.py +++ b/test/test_taint_analysis.py @@ -57,6 +57,24 @@ def sanitizer_app(): return FIXTURES_DIR / "sanitizer_app" +@pytest.fixture +def ssti_app(): + """Path to SSTI test app.""" + return FIXTURES_DIR / "ssti_app" + + +@pytest.fixture +def deserialization_app(): + """Path to unsafe deserialization test app.""" + return FIXTURES_DIR / "deserialization_app" + + +@pytest.fixture +def ssrf_app(): + """Path to SSRF test app.""" + return FIXTURES_DIR / "ssrf_app" + + @pytest.fixture def default_taint_config(): """Get default taint configuration.""" @@ -73,17 +91,16 @@ class TestTaintAnalysisConfiguration: def test_default_configuration(self, default_taint_config): """Test default taint configuration.""" assert len(default_taint_config.sources) > 0 - assert len(default_taint_config.sinks) > 0 + # Sinks list is intentionally empty — all sinks are covered by CodeQL's built-in + # security models (LdapInjection, Xxe, SSRF, SSTI, UnsafeDeserialization, …) + # imported in the generated query rather than enumerated here. 
+ assert isinstance(default_taint_config.sinks, list) assert len(default_taint_config.sanitizers) > 0 # Verify all sources are enabled by default enabled_sources = [s for s in default_taint_config.sources if s.enabled] assert len(enabled_sources) == len(default_taint_config.sources) - # Verify all sinks are enabled by default - enabled_sinks = [s for s in default_taint_config.sinks if s.enabled] - assert len(enabled_sinks) == len(default_taint_config.sinks) - def test_custom_configuration_yaml(self, sql_injection_app, tmp_path): """Test custom taint configuration from YAML.""" # Create custom config with only SQL injection sinks @@ -149,6 +166,58 @@ def test_config_merge_with_defaults(self, tmp_path): custom_sources = [s for s in config.sources if s.name == "custom_source"] assert len(custom_sources) == 1 + def test_query_contains_all_builtin_imports(self, default_taint_config): + """Generated query must import every newly added CodeQL security customization module.""" + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + query = TaintQueryGenerator.generate_query(default_taint_config) + expected_modules = [ + "LdapInjectionCustomizations", + "XxeCustomizations", + "ServerSideRequestForgeryCustomizations", + "TemplateInjectionCustomizations", + "UnsafeDeserializationCustomizations", + "UrlRedirectCustomizations", + "LogInjectionCustomizations", + "NoSqlInjectionCustomizations", + "XpathInjectionCustomizations", + "TarSlipCustomizations", + "HttpHeaderInjectionCustomizations", + "CookieInjectionCustomizations", + "PolynomialReDoSCustomizations", + # CleartextStorageCustomizations and CleartextLoggingCustomizations are + # intentionally excluded: they use SensitiveDataSource (not RemoteFlowSource) + # and produce false positives when combined with general user-input sources. 
+ ] + for mod in expected_modules: + assert mod in query, f"Generated query is missing import for {mod}" + + def test_query_contains_all_builtin_sinks(self, default_taint_config): + """Generated query must include instanceof checks for all built-in sink classes.""" + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + query = TaintQueryGenerator.generate_query(default_taint_config) + expected_sinks = [ + "LdapInjection::DnSink", + "LdapInjection::FilterSink", + "Xxe::Sink", + "ServerSideRequestForgery::Sink", + "TemplateInjection::Sink", + "UnsafeDeserialization::Sink", + "UrlRedirect::Sink", + "LogInjection::Sink", + "NoSqlInjection::StringSink", + "NoSqlInjection::DictSink", + "XpathInjection::Sink", + "TarSlip::Sink", + "HttpHeaderInjection::Sink", + "CookieInjection::Sink", + "PolynomialReDoS::Sink", + # CleartextStorage::Sink and CleartextLogging::Sink are intentionally excluded: + # these use SensitiveDataSource internally and produce false positives when + # combined with general user-input sources in a unified query. 
+ ] + for sink in expected_sinks: + assert sink in query, f"Generated query is missing instanceof check for {sink}" + class TestTaintAnalysisPydanticModels: """Tests for Pydantic models used in taint analysis.""" @@ -839,3 +908,83 @@ def test_analysis_level_3_requires_codeql(self, sql_injection_app, tmp_path): assert all(f.severity == "critical" for f in sql_flows), ( "All SQL Injection flows should be critical severity" ) + + +# ============================================================================ +# Integration Tests — New Vulnerability Types (require CodeQL) +# ============================================================================ + +class TestTaintAnalysisNewVulnerabilityTypes: + """Integration tests for vulnerability types added via the expanded built-in CodeQL models.""" + + def test_ssti_detection(self, ssti_db, codeql_packs_dir): + """Server-Side Template Injection must be detected in ssti_app fixture.""" + import shutil + if not shutil.which("codeql"): + pytest.skip("CodeQL not available") + if codeql_packs_dir is None: + pytest.skip("CodeQL packs not available") + + codeql = CodeQL( + project_dir=FIXTURES_DIR / "ssti_app", + db_path=ssti_db, + codeql_packs_dir=codeql_packs_dir, + ) + from codeanalyzer.config.taint_config_defaults import get_default_taint_config as _get_cfg + result = codeql.analyze_taint_flows(config_override=_get_cfg()) + + ssti_flows = [f for f in result.flows if "Template Injection" in f.vulnerability_type] + assert len(ssti_flows) >= 1, ( + f"Expected at least 1 SSTI flow, got {len(ssti_flows)}. 
" + f"All flows: {[f.vulnerability_type for f in result.flows]}" + ) + assert all(f.severity == "critical" for f in ssti_flows), ( + "All SSTI flows should be critical severity" + ) + + def test_unsafe_deserialization_detection(self, deserialization_db, codeql_packs_dir): + """Unsafe Deserialization must be detected in deserialization_app fixture.""" + import shutil + if not shutil.which("codeql"): + pytest.skip("CodeQL not available") + if codeql_packs_dir is None: + pytest.skip("CodeQL packs not available") + + codeql = CodeQL( + project_dir=FIXTURES_DIR / "deserialization_app", + db_path=deserialization_db, + codeql_packs_dir=codeql_packs_dir, + ) + from codeanalyzer.config.taint_config_defaults import get_default_taint_config as _get_cfg + result = codeql.analyze_taint_flows(config_override=_get_cfg()) + + deser_flows = [f for f in result.flows if "Deserialization" in f.vulnerability_type] + assert len(deser_flows) >= 1, ( + f"Expected at least 1 Unsafe Deserialization flow, got {len(deser_flows)}. " + f"All flows: {[f.vulnerability_type for f in result.flows]}" + ) + assert all(f.severity == "critical" for f in deser_flows), ( + "All Unsafe Deserialization flows should be critical severity" + ) + + def test_ssrf_detection(self, ssrf_db, codeql_packs_dir): + """Server-Side Request Forgery must be detected in ssrf_app fixture.""" + import shutil + if not shutil.which("codeql"): + pytest.skip("CodeQL not available") + if codeql_packs_dir is None: + pytest.skip("CodeQL packs not available") + + codeql = CodeQL( + project_dir=FIXTURES_DIR / "ssrf_app", + db_path=ssrf_db, + codeql_packs_dir=codeql_packs_dir, + ) + from codeanalyzer.config.taint_config_defaults import get_default_taint_config as _get_cfg + result = codeql.analyze_taint_flows(config_override=_get_cfg()) + + ssrf_flows = [f for f in result.flows if "Request Forgery" in f.vulnerability_type] + assert len(ssrf_flows) >= 1, ( + f"Expected at least 1 SSRF flow, got {len(ssrf_flows)}. 
" + f"All flows: {[f.vulnerability_type for f in result.flows]}" + ) From 509a5419435bf3f722cd8df1cfb99e2cd685d27d Mon Sep 17 00:00:00 2001 From: Saurabh Sinha Date: Wed, 20 May 2026 11:36:04 -0400 Subject: [PATCH 3/4] Improve taint analysis extensibility: fix merge bugs, add disabled_builtin_sinks, three-mode config control, and validation Signed-off-by: Saurabh Sinha --- codeanalyzer/__main__.py | 19 +- codeanalyzer/config/taint_config_loader.py | 58 +++-- codeanalyzer/core.py | 13 +- codeanalyzer/options/options.py | 1 + codeanalyzer/schema/py_schema.py | 10 + .../codeql/taint_query_generator.py | 163 ++++---------- examples/taint-config.example.yaml | 137 ++++++++++++ test/test_taint_analysis.py | 206 ++++++++++++++++++ 8 files changed, 464 insertions(+), 143 deletions(-) create mode 100644 examples/taint-config.example.yaml diff --git a/codeanalyzer/__main__.py b/codeanalyzer/__main__.py index 02b25ae..5d5f65b 100644 --- a/codeanalyzer/__main__.py +++ b/codeanalyzer/__main__.py @@ -41,6 +41,18 @@ def main( help="Path to taint analysis configuration file (YAML or JSON). 
Used with --analysis-level 3.", ), ] = None, + taint_use_defaults: Annotated[ + bool, + typer.Option( + "--taint-defaults/--no-taint-defaults", + help=( + "Controls which taint sources/sinks/sanitizers are active:\n\n" + " (no --taint-config) → built-in defaults only\n" + " --taint-config + --taint-defaults → union of defaults and custom config [default]\n" + " --taint-config + --no-taint-defaults → custom config only, replaces all defaults" + ), + ), + ] = True, using_ray: Annotated[ bool, typer.Option("--ray/--no-ray", help="Enable Ray for distributed analysis."), @@ -89,10 +101,14 @@ def main( if analysis_level >= 2 and not using_codeql: logger.error("Analysis levels 2 and 3 require --codeql flag") raise typer.Exit(code=1) - + if analysis_level >= 3 and taint_config and not taint_config.exists(): logger.error(f"Taint configuration file '{taint_config}' does not exist.") raise typer.Exit(code=1) + + if not taint_use_defaults and not taint_config: + logger.error("--no-taint-defaults requires --taint-config (otherwise nothing would be analyzed).") + raise typer.Exit(code=1) options = AnalysisOptions( input=input, @@ -108,6 +124,7 @@ def main( clear_cache=clear_cache, verbosity=verbosity, taint_config=taint_config, + taint_use_defaults=taint_use_defaults, ) _set_log_level(options.verbosity) diff --git a/codeanalyzer/config/taint_config_loader.py b/codeanalyzer/config/taint_config_loader.py index 9a409b0..e51455f 100644 --- a/codeanalyzer/config/taint_config_loader.py +++ b/codeanalyzer/config/taint_config_loader.py @@ -56,32 +56,56 @@ def load_config( FileNotFoundError: If config_path is provided but file doesn't exist ValueError: If file format is unsupported or invalid """ + # Log which of the three config modes is active + if config_path and use_defaults: + logger.info(f"Taint config mode: merging '{config_path}' with built-in defaults") + elif config_path: + logger.info(f"Taint config mode: custom only — '{config_path}' (built-in defaults disabled)") + else: + 
logger.info("Taint config mode: built-in defaults only") + # Start with defaults if requested if use_defaults: config = get_default_taint_config() - logger.debug(f"Loaded default taint configuration with {len(config.sources)} sources, " - f"{len(config.sinks)} sinks, {len(config.sanitizers)} sanitizers") + logger.debug( + f" Defaults loaded: {len(config.sources)} sources, " + f"{len(config.sanitizers)} sanitizers" + ) else: config = TaintAnalysisConfig() - logger.debug("Starting with empty taint configuration") - + # Load and merge custom configuration if config_path: custom_config = TaintConfigLoader._load_from_file(config_path) + logger.debug( + f" Custom file adds: {len(custom_config.sources)} sources, " + f"{len(custom_config.sinks)} sinks, " + f"{len(custom_config.sanitizers)} sanitizers" + ) config = TaintConfigLoader._merge_configs(config, custom_config) - logger.info(f"Merged custom configuration from {config_path}") - + # Filter out disabled items config = TaintConfigLoader._filter_disabled(config) - + + # Warn on any structural problems (missing sources, empty patterns, etc.) 
+ issues = TaintConfigLoader.validate_config(config) + for issue in issues: + logger.warning(f"Taint config: {issue}") + n_builtin = TaintQueryGenerator.builtin_sink_count() + n_disabled = len(config.disabled_builtin_sinks) + active_builtin = n_builtin - n_disabled + builtin_label = ( + f"{active_builtin} of {n_builtin} built-in CodeQL sinks" + if n_disabled + else f"{n_builtin} built-in CodeQL sinks" + ) logger.info( - f"Final taint configuration: {len(config.sources)} sources, " - f"{len(config.sinks)} user-configured sinks " - f"(+{n_builtin} built-in CodeQL sink models always active), " + f"Active taint config: {len(config.sources)} sources, " + f"{len(config.sinks)} user-defined sinks (+{builtin_label}), " f"{len(config.sanitizers)} sanitizers" ) - + return config @staticmethod @@ -185,19 +209,20 @@ def _merge_configs( merged_exclude_files = list(set(base.exclude_files + custom.exclude_files)) merged_exclude_functions = list(set(base.exclude_functions + custom.exclude_functions)) - # Create merged config - # Use custom values for options if they differ from defaults + # Scalar options: custom always wins (it owns those knobs). + # Booleans that are "additive" (enabling features) use OR. 
return TaintAnalysisConfig( sources=list(base_sources.values()), sinks=list(base_sinks.values()), sanitizers=list(base_sanitizers.values()), - max_path_length=custom.max_path_length if custom.max_path_length != 10 else base.max_path_length, + max_path_length=custom.max_path_length, include_implicit_flows=custom.include_implicit_flows or base.include_implicit_flows, - confidence_threshold=custom.confidence_threshold if custom.confidence_threshold != "medium" else base.confidence_threshold, + confidence_threshold=custom.confidence_threshold, exclude_files=merged_exclude_files, exclude_functions=merged_exclude_functions, include_safe_flows=custom.include_safe_flows or base.include_safe_flows, - group_by_vulnerability=custom.group_by_vulnerability if not custom.group_by_vulnerability else base.group_by_vulnerability, + group_by_vulnerability=custom.group_by_vulnerability, + disabled_builtin_sinks=list(set(base.disabled_builtin_sinks + custom.disabled_builtin_sinks)), ) @staticmethod @@ -234,6 +259,7 @@ def _filter_disabled(config: TaintAnalysisConfig) -> TaintAnalysisConfig: exclude_functions=config.exclude_functions, include_safe_flows=config.include_safe_flows, group_by_vulnerability=config.group_by_vulnerability, + disabled_builtin_sinks=config.disabled_builtin_sinks, ) @staticmethod diff --git a/codeanalyzer/core.py b/codeanalyzer/core.py index 51ccecd..255a861 100644 --- a/codeanalyzer/core.py +++ b/codeanalyzer/core.py @@ -758,23 +758,16 @@ def _perform_taint_analysis(self, symbol_table: Optional[Dict[str, PyModule]] = if not self.db_path: raise ValueError("CodeQL database not available for taint analysis") - # Load taint configuration + # Load taint configuration — load_config logs the mode and active counts + use_defaults = getattr(self.options, "taint_use_defaults", True) if self.options.taint_config: - logger.info(f"Loading taint configuration from {self.options.taint_config}") taint_config = TaintConfigLoader.load_config( self.options.taint_config, - 
use_defaults=True + use_defaults=use_defaults, ) else: - logger.info("Using default taint analysis configuration") taint_config = TaintConfigLoader.load_config(use_defaults=True) - # Log configuration summary - logger.info(f"Taint analysis configuration:") - logger.info(f" - Sources: {len(taint_config.sources)}") - logger.info(f" - Sinks: {len(taint_config.sinks)}") - logger.info(f" - Sanitizers: {len(taint_config.sanitizers)}") - # Perform analysis codeql = CodeQL( project_dir=self.project_dir, diff --git a/codeanalyzer/options/options.py b/codeanalyzer/options/options.py index e4d32e8..b14033e 100644 --- a/codeanalyzer/options/options.py +++ b/codeanalyzer/options/options.py @@ -24,3 +24,4 @@ class AnalysisOptions: clear_cache: bool = False verbosity: int = 0 taint_config: Optional[Path] = None + taint_use_defaults: bool = True diff --git a/codeanalyzer/schema/py_schema.py b/codeanalyzer/schema/py_schema.py index 6dd004b..832b2b4 100644 --- a/codeanalyzer/schema/py_schema.py +++ b/codeanalyzer/schema/py_schema.py @@ -523,6 +523,16 @@ class TaintAnalysisConfig(BaseModel): group_by_vulnerability: bool = True """When ``True``, results are grouped by vulnerability type in log output.""" + disabled_builtin_sinks: List[str] = [] + """Names of built-in CodeQL sink classes to suppress. + + Each entry must match a ``class::SinkClass`` string from + ``TaintQueryGenerator.BUILTIN_SINKS`` (e.g. ``"PolynomialReDoS::Sink"`` + or ``"CookieInjection::Sink"``). Matching entries are skipped during + query generation so that specific vulnerability types can be excluded + without replacing the entire built-in sink set. 
+ """ + @builder @msgpk diff --git a/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py index b392e78..054fe58 100644 --- a/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py +++ b/codeanalyzer/semantic_analysis/codeql/taint_query_generator.py @@ -51,7 +51,6 @@ from codeanalyzer.schema.py_schema import ( TaintAnalysisConfig, TaintSourceConfig, - TaintSinkConfig, TaintSanitizerConfig, ) @@ -59,29 +58,31 @@ class TaintQueryGenerator: """Generates CodeQL queries from taint analysis configuration.""" - # Built-in CodeQL sink models always included in the generated query, - # regardless of user configuration. Each entry is (module::SinkClass, vulnerability_type). - BUILTIN_SINKS: List[tuple] = [ - ("SqlInjection::Sink", "SQL Injection"), - ("CommandInjection::Sink", "Command Injection"), - ("CodeInjection::Sink", "Code Injection"), - ("PathInjection::Sink", "Path Traversal"), - ("ReflectedXss::Sink", "Cross-Site Scripting (XSS)"), - ("LdapInjection::DnSink", "LDAP Injection"), - ("LdapInjection::FilterSink", "LDAP Injection"), - ("Xxe::Sink", "XML External Entity (XXE)"), - ("ServerSideRequestForgery::Sink", "Server-Side Request Forgery (SSRF)"), - ("TemplateInjection::Sink", "Server-Side Template Injection (SSTI)"), - ("UnsafeDeserialization::Sink", "Unsafe Deserialization"), - ("UrlRedirect::Sink", "Open Redirect"), - ("LogInjection::Sink", "Log Injection"), - ("NoSqlInjection::StringSink", "NoSQL Injection"), - ("NoSqlInjection::DictSink", "NoSQL Injection"), - ("XpathInjection::Sink", "XPath Injection"), - ("TarSlip::Sink", "Tar/Zip Slip"), - ("HttpHeaderInjection::Sink", "HTTP Header Injection"), - ("CookieInjection::Sink", "Cookie Injection"), - ("PolynomialReDoS::Sink", "Regular Expression Injection (ReDoS)"), + # Built-in CodeQL sink models included in the generated query by default. 
+ # Each dict has: class (CodeQL class expression), sink_type, severity, + # vulnerability_type, and comment (used as inline documentation in the query). + # Individual entries can be suppressed via TaintAnalysisConfig.disabled_builtin_sinks. + BUILTIN_SINKS: List[dict] = [ + {"class": "SqlInjection::Sink", "sink_type": "sql_execution", "severity": "critical", "vulnerability_type": "SQL Injection", "comment": "sqlite3, psycopg2, SQLAlchemy, Django ORM raw, …"}, + {"class": "CommandInjection::Sink", "sink_type": "command_execution","severity": "critical", "vulnerability_type": "Command Injection", "comment": "subprocess.*, os.system, os.popen, …"}, + {"class": "CodeInjection::Sink", "sink_type": "code_execution", "severity": "critical", "vulnerability_type": "Code Injection", "comment": "eval, exec, compile, …"}, + {"class": "PathInjection::Sink", "sink_type": "file_access", "severity": "high", "vulnerability_type": "Path Traversal", "comment": "open, os.path.*, pathlib.Path.open, …"}, + {"class": "ReflectedXss::Sink", "sink_type": "template_rendering","severity": "high", "vulnerability_type": "Cross-Site Scripting (XSS)", "comment": "Flask/Django template rendering, …"}, + {"class": "LdapInjection::DnSink", "sink_type": "ldap_query", "severity": "high", "vulnerability_type": "LDAP Injection", "comment": "LDAP DN component"}, + {"class": "LdapInjection::FilterSink", "sink_type": "ldap_query", "severity": "high", "vulnerability_type": "LDAP Injection", "comment": "LDAP filter component"}, + {"class": "Xxe::Sink", "sink_type": "xml_parsing", "severity": "high", "vulnerability_type": "XML External Entity (XXE)", "comment": "XML parsers with external entity expansion"}, + {"class": "ServerSideRequestForgery::Sink", "sink_type": "ssrf_request", "severity": "high", "vulnerability_type": "Server-Side Request Forgery (SSRF)", "comment": "outbound HTTP requests with user-controlled URL"}, + {"class": "TemplateInjection::Sink", "sink_type": "template_rendering","severity": 
"critical","vulnerability_type": "Server-Side Template Injection (SSTI)", "comment": "render_template_string, Jinja2 Environment.from_string, …"}, + {"class": "UnsafeDeserialization::Sink", "sink_type": "deserialization", "severity": "critical", "vulnerability_type": "Unsafe Deserialization", "comment": "pickle.loads, yaml.load, …"}, + {"class": "UrlRedirect::Sink", "sink_type": "url_redirect", "severity": "medium", "vulnerability_type": "Open Redirect", "comment": "redirect(), HttpResponseRedirect, …"}, + {"class": "LogInjection::Sink", "sink_type": "log_output", "severity": "medium", "vulnerability_type": "Log Injection", "comment": "logging.*, structlog, …"}, + {"class": "NoSqlInjection::StringSink", "sink_type": "nosql_query", "severity": "high", "vulnerability_type": "NoSQL Injection", "comment": "MongoDB/Redis string queries"}, + {"class": "NoSqlInjection::DictSink", "sink_type": "nosql_query", "severity": "high", "vulnerability_type": "NoSQL Injection", "comment": "MongoDB dict/object queries"}, + {"class": "XpathInjection::Sink", "sink_type": "xpath_query", "severity": "high", "vulnerability_type": "XPath Injection", "comment": "lxml, ElementTree XPath expressions"}, + {"class": "TarSlip::Sink", "sink_type": "file_access", "severity": "high", "vulnerability_type": "Tar/Zip Slip", "comment": "tarfile.extract, zipfile.extractall, …"}, + {"class": "HttpHeaderInjection::Sink", "sink_type": "http_header", "severity": "medium", "vulnerability_type": "HTTP Header Injection", "comment": "Response.headers, …"}, + {"class": "CookieInjection::Sink", "sink_type": "cookie_write", "severity": "medium", "vulnerability_type": "Cookie Injection", "comment": "set_cookie, …"}, + {"class": "PolynomialReDoS::Sink", "sink_type": "regex_execution", "severity": "medium", "vulnerability_type": "Regular Expression Injection (ReDoS)", "comment": "re.match/search/fullmatch with user-supplied pattern"}, ] @classmethod @@ -89,6 +90,11 @@ def builtin_sink_count(cls) -> int: """Number of 
built-in CodeQL sink models always active in the generated query.""" return len(cls.BUILTIN_SINKS) + @classmethod + def builtin_sink_names(cls) -> List[str]: + """All built-in sink class names (usable in ``disabled_builtin_sinks``).""" + return [s["class"] for s in cls.BUILTIN_SINKS] + @staticmethod def generate_query(config: TaintAnalysisConfig) -> str: """Generate complete taint analysis CodeQL query from configuration. @@ -108,7 +114,7 @@ def generate_query(config: TaintAnalysisConfig) -> str: query_parts.append(TaintQueryGenerator._generate_header()) query_parts.append(TaintQueryGenerator._generate_imports()) query_parts.append(TaintQueryGenerator._generate_source_predicate(config.sources)) - query_parts.append(TaintQueryGenerator._generate_sink_predicate(config.sinks)) + query_parts.append(TaintQueryGenerator._generate_sink_predicate(config)) if config.sanitizers: query_parts.append(TaintQueryGenerator._generate_sanitizer_predicate(config.sanitizers)) @@ -262,105 +268,30 @@ def _generate_source_predicate(sources: List[TaintSourceConfig]) -> str: lines.append("}") return "\n".join(lines) - @staticmethod - def _generate_sink_predicate(sinks: List[TaintSinkConfig]) -> str: + @classmethod + def _generate_sink_predicate(cls, config: "TaintAnalysisConfig") -> str: """Generate isSink predicate combining built-in security sinks with any user-configured sinks. - Built-in sink classes from ``codeql/python-all`` cover: - - ``SqlInjection::Sink`` — sqlite3, psycopg2, mysql-connector, - SQLAlchemy, Django ORM raw queries, … - - ``CommandInjection::Sink`` — subprocess.*, os.system, os.popen, … - - ``CodeInjection::Sink`` — eval(), exec(), compile(), … - - ``PathTraversal::Sink`` — open(), os.path.*, pathlib.Path.open(), … - - ``XSS::Sink`` — Flask/Django template rendering, … - - User-configured patterns extend this with project-specific sinks. + Built-in sinks are driven by ``BUILTIN_SINKS``; any whose ``class`` + appears in ``config.disabled_builtin_sinks`` are omitted. 
+ User-configured patterns in ``config.sinks`` are appended afterward. """ + disabled = set(config.disabled_builtin_sinks) + active_builtins = [s for s in cls.BUILTIN_SINKS if s["class"] not in disabled] + lines = [ "predicate isConfiguredSink(DataFlow::Node node, string sinkType, string severity, string vulnerabilityType) {", - " // Built-in: SQL injection sinks (sqlite3, psycopg2, SQLAlchemy, Django ORM raw, …)", - " (node instanceof SqlInjection::Sink and", - " sinkType = \"sql_execution\" and severity = \"critical\" and vulnerabilityType = \"SQL Injection\")", - " or", - " // Built-in: Command injection sinks (subprocess.*, os.system, os.popen, …)", - " (node instanceof CommandInjection::Sink and", - " sinkType = \"command_execution\" and severity = \"critical\" and vulnerabilityType = \"Command Injection\")", - " or", - " // Built-in: Code injection sinks (eval, exec, compile, …)", - " (node instanceof CodeInjection::Sink and", - " sinkType = \"code_execution\" and severity = \"critical\" and vulnerabilityType = \"Code Injection\")", - " or", - " // Built-in: Path injection sinks (open, os.path.*, pathlib.Path.open, …)", - " (node instanceof PathInjection::Sink and", - " sinkType = \"file_access\" and severity = \"high\" and vulnerabilityType = \"Path Traversal\")", - " or", - " // Built-in: Reflected XSS sinks (Flask/Django template rendering, …)", - " (node instanceof ReflectedXss::Sink and", - " sinkType = \"template_rendering\" and severity = \"high\" and vulnerabilityType = \"Cross-Site Scripting (XSS)\")", - " or", - " // Built-in: LDAP injection — DN component", - " (node instanceof LdapInjection::DnSink and", - " sinkType = \"ldap_query\" and severity = \"high\" and vulnerabilityType = \"LDAP Injection\")", - " or", - " // Built-in: LDAP injection — filter component", - " (node instanceof LdapInjection::FilterSink and", - " sinkType = \"ldap_query\" and severity = \"high\" and vulnerabilityType = \"LDAP Injection\")", - " or", - " // Built-in: XML 
External Entity (XXE) injection", - " (node instanceof Xxe::Sink and", - " sinkType = \"xml_parsing\" and severity = \"high\" and vulnerabilityType = \"XML External Entity (XXE)\")", - " or", - " // Built-in: Server-Side Request Forgery (SSRF)", - " (node instanceof ServerSideRequestForgery::Sink and", - " sinkType = \"ssrf_request\" and severity = \"high\" and vulnerabilityType = \"Server-Side Request Forgery (SSRF)\")", - " or", - " // Built-in: Server-Side Template Injection (SSTI)", - " (node instanceof TemplateInjection::Sink and", - " sinkType = \"template_rendering\" and severity = \"critical\" and vulnerabilityType = \"Server-Side Template Injection (SSTI)\")", - " or", - " // Built-in: Unsafe Deserialization (pickle, yaml.load, …)", - " (node instanceof UnsafeDeserialization::Sink and", - " sinkType = \"deserialization\" and severity = \"critical\" and vulnerabilityType = \"Unsafe Deserialization\")", - " or", - " // Built-in: Open Redirect", - " (node instanceof UrlRedirect::Sink and", - " sinkType = \"url_redirect\" and severity = \"medium\" and vulnerabilityType = \"Open Redirect\")", - " or", - " // Built-in: Log Injection", - " (node instanceof LogInjection::Sink and", - " sinkType = \"log_output\" and severity = \"medium\" and vulnerabilityType = \"Log Injection\")", - " or", - " // Built-in: NoSQL Injection — string payload", - " (node instanceof NoSqlInjection::StringSink and", - " sinkType = \"nosql_query\" and severity = \"high\" and vulnerabilityType = \"NoSQL Injection\")", - " or", - " // Built-in: NoSQL Injection — dictionary/object payload", - " (node instanceof NoSqlInjection::DictSink and", - " sinkType = \"nosql_query\" and severity = \"high\" and vulnerabilityType = \"NoSQL Injection\")", - " or", - " // Built-in: XPath Injection", - " (node instanceof XpathInjection::Sink and", - " sinkType = \"xpath_query\" and severity = \"high\" and vulnerabilityType = \"XPath Injection\")", - " or", - " // Built-in: Tar/Zip Slip (path traversal via 
archive extraction)", - " (node instanceof TarSlip::Sink and", - " sinkType = \"file_access\" and severity = \"high\" and vulnerabilityType = \"Tar/Zip Slip\")", - " or", - " // Built-in: HTTP Header Injection", - " (node instanceof HttpHeaderInjection::Sink and", - " sinkType = \"http_header\" and severity = \"medium\" and vulnerabilityType = \"HTTP Header Injection\")", - " or", - " // Built-in: Cookie Injection", - " (node instanceof CookieInjection::Sink and", - " sinkType = \"cookie_write\" and severity = \"medium\" and vulnerabilityType = \"Cookie Injection\")", - " or", - " // Built-in: Regular Expression Injection / Polynomial ReDoS", - " (node instanceof PolynomialReDoS::Sink and", - " sinkType = \"regex_execution\" and severity = \"medium\" and vulnerabilityType = \"Regular Expression Injection (ReDoS)\")", ] - for sink in sinks: + for i, sink in enumerate(active_builtins): + if i > 0: + lines.append(" or") + lines.append(f" // Built-in: {sink['vulnerability_type']} ({sink['comment']})") + lines.append(f" (node instanceof {sink['class']} and") + lines.append(f" sinkType = \"{sink['sink_type']}\" and severity = \"{sink['severity']}\" and vulnerabilityType = \"{sink['vulnerability_type']}\")") + + for sink in config.sinks: lines.append(" or") lines.append(f" // User-configured: {sink.description}") diff --git a/examples/taint-config.example.yaml b/examples/taint-config.example.yaml new file mode 100644 index 0000000..b2c23fe --- /dev/null +++ b/examples/taint-config.example.yaml @@ -0,0 +1,137 @@ +# Taint analysis configuration for codeanalyzer --analysis-level 3 +# +# Usage modes (controlled by --taint-defaults / --no-taint-defaults): +# +# Defaults only (no --taint-config): +# codeanalyzer -i ./myproject -a 3 --codeql +# +# Extend defaults with custom config (union): +# codeanalyzer -i ./myproject -a 3 --codeql --taint-config taint-config.yaml +# +# Custom config only (replace all defaults): +# codeanalyzer -i ./myproject -a 3 --codeql --taint-config 
taint-config.yaml --no-taint-defaults +# +# All three sections (sources, sinks, sanitizers) are optional. +# Omitted sections default to empty lists — the built-in CodeQL models +# (RemoteFlowSource, SqlInjection::Sink, CommandInjection::Sink, …) are +# always active unless explicitly suppressed via disabled_builtin_sinks. + +# --------------------------------------------------------------------------- +# Global options +# --------------------------------------------------------------------------- +max_path_length: 10 # Maximum taint-path steps reported (default: 10) +confidence_threshold: medium # Minimum confidence to include: high | medium | low +group_by_vulnerability: true # Group log output by vulnerability type + +# --------------------------------------------------------------------------- +# Suppress specific built-in CodeQL sink models +# --------------------------------------------------------------------------- +# Useful for noisy or irrelevant vulnerability types. +# Full list: run `python -c "from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator; print(*TaintQueryGenerator.builtin_sink_names(), sep='\n')"` +# +# disabled_builtin_sinks: +# - PolynomialReDoS::Sink # very noisy on regex-heavy codebases +# - CookieInjection::Sink + +# --------------------------------------------------------------------------- +# Additional taint sources (extends or replaces RemoteFlowSource) +# --------------------------------------------------------------------------- +# Pattern must be a valid CodeQL API-graph expression evaluating to a +# DataFlow::Node. Use double quotes inside the pattern — CodeQL does not +# support single-quoted strings. +# +sources: + # Environment variables (e.g. 
config loaded from os.environ) + - name: env_var + description: "os.environ and os.getenv calls" + pattern: 'API::moduleImport("os").getMember("environ").asSource()' + source_type: environment_variable + enabled: true + + # CLI arguments (sys.argv) + - name: sys_argv + description: "sys.argv command-line arguments" + pattern: 'API::moduleImport("sys").getMember("argv").asSource()' + source_type: cli_argument + enabled: true + + # Standard input + - name: builtin_input + description: "input() built-in" + pattern: 'API::builtin("input").getACall()' + source_type: user_input + enabled: true + + # Disable one of the above without removing it: + # - name: some_source + # ... + # enabled: false + +# --------------------------------------------------------------------------- +# Additional taint sinks (supplements built-in CodeQL sinks) +# --------------------------------------------------------------------------- +# Built-in sinks (SQL, command, path traversal, XSS, SSTI, SSRF, …) are +# always active. Add entries here for project-specific APIs not covered +# by CodeQL's model library. 
+# +sinks: + # Project-specific DB wrapper + - name: custom_db_execute + description: "Internal db.execute() wrapper" + pattern: 'API::moduleImport("myapp.db").getMember("execute").getACall()' + sink_type: sql_execution + vulnerability_type: SQL Injection + severity: critical + argument_index: 0 # Only the first argument (the query string) is the sink + + # Custom HTTP client + - name: internal_http_get + description: "Internal HTTP client get()" + pattern: 'API::moduleImport("myapp.http").getMember("get").getACall()' + sink_type: ssrf_request + vulnerability_type: Server-Side Request Forgery (SSRF) + severity: high + # argument_index omitted → any tainted argument triggers the sink + +# --------------------------------------------------------------------------- +# Sanitizers (block taint propagation through the matching node) +# --------------------------------------------------------------------------- +# All enabled sanitizers unconditionally block all taint flows passing +# through them. The `sanitizes` list is informational documentation only. +# +sanitizers: + # HTML escaping + - name: html_escape + description: "html.escape() neutralises XSS" + pattern: 'API::moduleImport("html").getMember("escape").getACall()' + sanitizes: [xss, template_injection] + enabled: true + + # Shell quoting + - name: shlex_quote + description: "shlex.quote() neutralises command injection" + pattern: 'API::moduleImport("shlex").getMember("quote").getACall()' + sanitizes: [command_injection] + enabled: true + + # HTML sanitisation (bleach.clean) + # Note: for SQL parameter binding (cursor.execute with args), prefer + # argument_index on the sink side rather than a sanitizer entry. 
+ - name: bleach_clean + description: "bleach.clean() HTML sanitiser" + pattern: 'API::moduleImport("bleach").getMember("clean").getACall()' + sanitizes: [xss] + enabled: true + +# --------------------------------------------------------------------------- +# Exclusions +# --------------------------------------------------------------------------- +# exclude_files: glob patterns relative to the project root +# exclude_functions: fully-qualified function names to exclude as sources/sinks +# +# exclude_files: +# - "tests/**" +# - "**/*_test.py" +# +# exclude_functions: +# - myapp.utils.sanitize_input diff --git a/test/test_taint_analysis.py b/test/test_taint_analysis.py index 6b48e2c..853ac69 100644 --- a/test/test_taint_analysis.py +++ b/test/test_taint_analysis.py @@ -341,6 +341,212 @@ def test_disabled_sources_and_sinks(self, sql_injection_app, tmp_path): assert len(filtered_config.sinks) == 0 +# ============================================================================ +# Extensibility mechanism unit tests (no CodeQL required) +# ============================================================================ + +class TestTaintConfigExtensibility: + """Tests for the taint config extensibility mechanism: merge, disabled sinks, + use_defaults, and validate_config integration.""" + + # ------------------------------------------------------------------ + # Scalar merge correctness + # ------------------------------------------------------------------ + + def test_merge_scalars_custom_wins(self): + """Custom config scalars always override base — was broken before fix.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + base = TaintAnalysisConfig(max_path_length=15, group_by_vulnerability=False, confidence_threshold="low") + custom = TaintAnalysisConfig(max_path_length=5, group_by_vulnerability=True, confidence_threshold="high") + merged = TaintConfigLoader._merge_configs(base, custom) + assert merged.max_path_length == 5 + assert 
merged.group_by_vulnerability is True + assert merged.confidence_threshold == "high" + + def test_merge_scalars_custom_default_value_still_wins(self): + """Custom config with value == schema default (e.g. max_path_length=10) must win. + Previously a sentinel comparison '!= 10' silently ignored this case.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + base = TaintAnalysisConfig(max_path_length=20, confidence_threshold="low") + custom = TaintAnalysisConfig(max_path_length=10, confidence_threshold="medium") + merged = TaintConfigLoader._merge_configs(base, custom) + assert merged.max_path_length == 10, "max_path_length=10 must not be silently discarded" + assert merged.confidence_threshold == "medium", "confidence_threshold='medium' must not be silently discarded" + + def test_merge_additive_booleans(self): + """include_implicit_flows and include_safe_flows use OR (enabling is additive).""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + base = TaintAnalysisConfig(include_implicit_flows=True, include_safe_flows=False) + custom = TaintAnalysisConfig(include_implicit_flows=False, include_safe_flows=True) + merged = TaintConfigLoader._merge_configs(base, custom) + assert merged.include_implicit_flows is True # OR(True, False) + assert merged.include_safe_flows is True # OR(False, True) + + def test_merge_exclude_lists_combined(self): + """exclude_files and exclude_functions are unioned across base and custom.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + base = TaintAnalysisConfig(exclude_files=["tests/**"], exclude_functions=["myapp.utils.safe"]) + custom = TaintAnalysisConfig(exclude_files=["vendor/**"], exclude_functions=["myapp.debug.dump"]) + merged = TaintConfigLoader._merge_configs(base, custom) + assert "tests/**" in merged.exclude_files + assert "vendor/**" in merged.exclude_files + assert "myapp.utils.safe" in merged.exclude_functions + assert "myapp.debug.dump" in merged.exclude_functions + + # 
------------------------------------------------------------------ + # disabled_builtin_sinks + # ------------------------------------------------------------------ + + def test_disabled_builtin_sinks_removes_from_query(self): + """Sinks listed in disabled_builtin_sinks must not appear in generated query.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + config = TaintAnalysisConfig(disabled_builtin_sinks=["PolynomialReDoS::Sink", "CookieInjection::Sink"]) + query = TaintQueryGenerator.generate_query(config) + assert "PolynomialReDoS::Sink" not in query + assert "CookieInjection::Sink" not in query + assert "SqlInjection::Sink" in query # others remain + + def test_disabled_builtin_sinks_empty_keeps_all(self): + """Empty disabled_builtin_sinks list keeps all 20 built-in sinks in query.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + config = TaintAnalysisConfig() + query = TaintQueryGenerator.generate_query(config) + for name in TaintQueryGenerator.builtin_sink_names(): + assert name in query, f"Expected {name} in query with no disabled sinks" + + def test_disabled_builtin_sinks_merged_from_both_sides(self): + """disabled_builtin_sinks from base and custom are unioned on merge.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + base = TaintAnalysisConfig(disabled_builtin_sinks=["CookieInjection::Sink"]) + custom = TaintAnalysisConfig(disabled_builtin_sinks=["PolynomialReDoS::Sink"]) + merged = TaintConfigLoader._merge_configs(base, custom) + assert "CookieInjection::Sink" in merged.disabled_builtin_sinks + assert "PolynomialReDoS::Sink" in merged.disabled_builtin_sinks + + def test_disabled_builtin_sinks_survives_filter_disabled(self): + """_filter_disabled must carry disabled_builtin_sinks through unchanged.""" + from 
codeanalyzer.schema.py_schema import TaintAnalysisConfig + config = TaintAnalysisConfig(disabled_builtin_sinks=["TarSlip::Sink"]) + filtered = TaintConfigLoader._filter_disabled(config) + assert "TarSlip::Sink" in filtered.disabled_builtin_sinks + + def test_disabled_builtin_sinks_from_yaml(self, tmp_path): + """disabled_builtin_sinks loaded from YAML file is honoured in query.""" + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + yaml_content = """ +disabled_builtin_sinks: + - PolynomialReDoS::Sink + - HttpHeaderInjection::Sink +sources: [] +sinks: [] +sanitizers: [] +""" + config_file = tmp_path / "cfg.yaml" + config_file.write_text(yaml_content) + config = TaintConfigLoader.load_config(config_file, use_defaults=False) + assert "PolynomialReDoS::Sink" in config.disabled_builtin_sinks + query = TaintQueryGenerator.generate_query(config) + assert "PolynomialReDoS::Sink" not in query + assert "HttpHeaderInjection::Sink" not in query + + # ------------------------------------------------------------------ + # use_defaults flag / three modes + # ------------------------------------------------------------------ + + def test_use_defaults_false_no_custom_gives_empty_config(self): + """use_defaults=False with no config_path produces empty sources/sinks/sanitizers.""" + config = TaintConfigLoader.load_config(use_defaults=False) + assert len(config.sources) == 0 + assert len(config.sinks) == 0 + assert len(config.sanitizers) == 0 + + def test_use_defaults_true_gives_default_sources(self): + """use_defaults=True (default) loads default sources and sanitizers.""" + config = TaintConfigLoader.load_config(use_defaults=True) + assert len(config.sources) > 0 + assert len(config.sanitizers) > 0 + + def test_use_defaults_false_with_custom_config_is_custom_only(self, tmp_path): + """Mode 2: --no-taint-defaults → only custom sources/sinks, no defaults.""" + yaml_content = """ +sources: + - name: only_source + description: "Only this 
source" + pattern: 'API::builtin("input").getACall()' + source_type: user_input + enabled: true +sinks: [] +sanitizers: [] +""" + config_file = tmp_path / "custom_only.yaml" + config_file.write_text(yaml_content) + config = TaintConfigLoader.load_config(config_file, use_defaults=False) + assert len(config.sources) == 1 + assert config.sources[0].name == "only_source" + + def test_use_defaults_true_with_custom_config_is_union(self, tmp_path): + """Mode 3: --taint-defaults + --taint-config → union of defaults and custom.""" + yaml_content = """ +sources: + - name: extra_source + description: "Additional source" + pattern: 'API::builtin("input").getACall()' + source_type: user_input + enabled: true +sinks: [] +sanitizers: [] +""" + config_file = tmp_path / "extra.yaml" + config_file.write_text(yaml_content) + config = TaintConfigLoader.load_config(config_file, use_defaults=True) + names = [s.name for s in config.sources] + assert "extra_source" in names + assert len(config.sources) > 1 # defaults present too + + # ------------------------------------------------------------------ + # validate_config integration + # ------------------------------------------------------------------ + + def test_validate_config_warns_no_sources(self): + """validate_config returns an issue when no sources are configured.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig + config = TaintAnalysisConfig(sources=[], sinks=[], sanitizers=[]) + issues = TaintConfigLoader.validate_config(config) + assert any("No taint sources" in i for i in issues) + + def test_validate_config_returns_issues_for_empty_pattern(self): + """validate_config catches empty pattern strings.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig, TaintSourceConfig + config = TaintAnalysisConfig( + sources=[TaintSourceConfig(name="bad", description="d", pattern=" ", source_type="t")] + ) + issues = TaintConfigLoader.validate_config(config) + assert any("Empty pattern" in i for i in issues) + 
+ def test_validate_config_returns_issues_for_duplicates(self): + """validate_config catches duplicate source names.""" + from codeanalyzer.schema.py_schema import TaintAnalysisConfig, TaintSourceConfig + src = TaintSourceConfig(name="dup", description="d", pattern="API::builtin(\"x\")", source_type="t") + config = TaintAnalysisConfig(sources=[src, src]) + issues = TaintConfigLoader.validate_config(config) + assert any("Duplicate" in i for i in issues) + + # ------------------------------------------------------------------ + # builtin_sink_names helper + # ------------------------------------------------------------------ + + def test_builtin_sink_names_complete(self): + """builtin_sink_names() returns exactly 20 entries matching BUILTIN_SINKS.""" + from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + names = TaintQueryGenerator.builtin_sink_names() + assert len(names) == TaintQueryGenerator.builtin_sink_count() + assert "SqlInjection::Sink" in names + assert "UnsafeDeserialization::Sink" in names + assert "TemplateInjection::Sink" in names + + # ============================================================================ # Integration Tests (require CodeQL databases) # ============================================================================ From d0d15689cf6039652bc0e1c6897d3203e1147540 Mon Sep 17 00:00:00 2001 From: Saurabh Sinha Date: Wed, 20 May 2026 16:54:39 -0400 Subject: [PATCH 4/4] Add test case with taint config in json format; add user guide Signed-off-by: Saurabh Sinha --- docs/TAINT_ANALYSIS_USER_GUIDE.md | 504 ++++++++++++++++++++++++++++++ test/test_taint_analysis.py | 37 +++ 2 files changed, 541 insertions(+) create mode 100644 docs/TAINT_ANALYSIS_USER_GUIDE.md diff --git a/docs/TAINT_ANALYSIS_USER_GUIDE.md b/docs/TAINT_ANALYSIS_USER_GUIDE.md new file mode 100644 index 0000000..6bdb8d9 --- /dev/null +++ b/docs/TAINT_ANALYSIS_USER_GUIDE.md @@ -0,0 +1,504 @@ +# Taint Analysis User Guide + +Taint analysis 
(analysis level 3) tracks untrusted data from entry points +(**sources**) through the application to dangerous call sites (**sinks**), +reporting each path as a security vulnerability. It is powered by CodeQL and +requires the CodeQL CLI to be installed. + +--- + +## Table of Contents + +1. [Quick start](#quick-start) +2. [How it works](#how-it-works) +3. [Built-in coverage](#built-in-coverage) +4. [Configuration modes](#configuration-modes) +5. [Configuration file reference](#configuration-file-reference) +6. [Writing patterns](#writing-patterns) +7. [Output format](#output-format) +8. [Programmatic API](#programmatic-api) +9. [Troubleshooting](#troubleshooting) + +--- + +## Quick start + +```bash +# Analyse a project with all built-in defaults +codeanalyzer -i ./myproject -a 3 --codeql + +# Extend defaults with project-specific sources/sinks +codeanalyzer -i ./myproject -a 3 --codeql --taint-config taint.yaml + +# Use only your own config, no built-in defaults +codeanalyzer -i ./myproject -a 3 --codeql --taint-config taint.yaml --no-taint-defaults +``` + +--- + +## How it works + +The analysis generates a CodeQL query from four layers: + +1. **Built-in sources** — CodeQL's `RemoteFlowSource` class, which + automatically recognises all web-framework request inputs (Flask, Django, + FastAPI, aiohttp, Tornado, …) without any manual configuration. + +2. **Supplementary sources** — Additional sources provided by the default + configuration or your custom config file (e.g. `sys.argv`, `input()`, + environment variables). + +3. **Sinks** — Two complementary layers: + - *Built-in CodeQL sinks* — 20 vulnerability-specific sink classes + (SQL, command injection, path traversal, XSS, SSRF, SSTI, …) that + cover hundreds of framework APIs automatically. These are **always + active** unless explicitly suppressed with `disabled_builtin_sinks`. + - *User-defined sinks* — Project-specific APIs added via config file. + +4.
**Sanitizers** — Call sites that block taint propagation (HTML escape, + shell quoting, path normalisation, …). + +--- + +## Built-in coverage + +### Default sources (always active) + +| Name | What it matches | Source type | +|---|---|---| +| `RemoteFlowSource` (CodeQL) | All web-framework request inputs | `web_request` | +| `command_line_args` | `sys.argv` | `command_line_argument` | +| `user_input` | `input()` | `user_input` | +| `env_getenv` | `os.getenv()` | `environment_variable` | +| `env_environ_get` | `os.environ.get()` | `environment_variable` | +| `requests_get_response` | `requests.get().text` | `http_response` | +| `requests_post_response` | `requests.post().text` | `http_response` | + +### Built-in sinks (always active, 20 total) + +| CodeQL class | Vulnerability type | Severity | +|---|---|---| +| `SqlInjection::Sink` | SQL Injection | critical | +| `CommandInjection::Sink` | Command Injection | critical | +| `CodeInjection::Sink` | Code Injection | critical | +| `TemplateInjection::Sink` | Server-Side Template Injection (SSTI) | critical | +| `UnsafeDeserialization::Sink` | Unsafe Deserialization | critical | +| `PathInjection::Sink` | Path Traversal | high | +| `ReflectedXss::Sink` | Cross-Site Scripting (XSS) | high | +| `LdapInjection::DnSink` | LDAP Injection | high | +| `LdapInjection::FilterSink` | LDAP Injection | high | +| `Xxe::Sink` | XML External Entity (XXE) | high | +| `ServerSideRequestForgery::Sink` | Server-Side Request Forgery (SSRF) | high | +| `NoSqlInjection::StringSink` | NoSQL Injection | high | +| `NoSqlInjection::DictSink` | NoSQL Injection | high | +| `XpathInjection::Sink` | XPath Injection | high | +| `TarSlip::Sink` | Tar/Zip Slip | high | +| `UrlRedirect::Sink` | Open Redirect | medium | +| `LogInjection::Sink` | Log Injection | medium | +| `HttpHeaderInjection::Sink` | HTTP Header Injection | medium | +| `CookieInjection::Sink` | Cookie Injection | medium | +| `PolynomialReDoS::Sink` | Regular Expression Injection 
(ReDoS) | medium | + +### Default sanitizers (always active) + +| Name | What it matches | +|---|---| +| `html_escape` | `html.escape()` | +| `markupsafe_escape` | `markupsafe.escape()` | +| `shlex_quote` | `shlex.quote()` | +| `os_path_normpath` | `os.path.normpath()` | +| `os_path_abspath` | `os.path.abspath()` | +| `pathlib_resolve` | `pathlib.Path.resolve()` | + +--- + +## Configuration modes + +| Invocation | What is active | +|---|---| +| No `--taint-config` | Built-in defaults only | +| `--taint-config file.yaml` | Defaults **extended** with `file.yaml` (union) | +| `--taint-config file.yaml --no-taint-defaults` | `file.yaml` only, no defaults | + +The third mode lets you constrain the analysis to a specific set of +sources/sinks — for example, when tuning for a particular project or auditing +a single vulnerability class. + +--- + +## Configuration file reference + +Configuration files can be YAML (`.yaml` / `.yml`) or JSON (`.json`). +All three top-level sections are optional; omit any section to inherit the +defaults for it (when `--taint-defaults` is active). 
+ +```yaml +# Optional global settings +max_path_length: 10 # Maximum taint-path steps (default: 10) +confidence_threshold: medium # high | medium | low (default: medium) +group_by_vulnerability: true # Group log output by type (default: true) + +# Suppress specific built-in CodeQL sinks (see list above) +disabled_builtin_sinks: [] + +# Exclude files / functions from analysis +exclude_files: [] # Glob patterns relative to project root +exclude_functions: [] # Fully-qualified function names + +# Additional sources, sinks, sanitizers (see sections below) +sources: [] +sinks: [] +sanitizers: [] +``` + +### `sources[]` + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | string | yes | Unique identifier used in logs and deduplication | +| `description` | string | yes | Human-readable explanation | +| `pattern` | string | yes | CodeQL API-graph expression (see [Writing patterns](#writing-patterns)) | +| `source_type` | string | yes | Label propagated to `PyTaintSource.source_type` in results | +| `enabled` | bool | no | Default `true`; set `false` to temporarily disable | + +```yaml +sources: + - name: redis_get + description: "Values retrieved from Redis" + pattern: 'API::moduleImport("redis").getMember("Redis").getInstance().getMember("get").getReturn()' + source_type: cache_read +``` + +### `sinks[]` + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | string | yes | Unique identifier | +| `description` | string | yes | Human-readable explanation | +| `pattern` | string | yes | CodeQL API-graph expression | +| `sink_type` | string | yes | Label propagated to `PyTaintSink.sink_type` in results | +| `vulnerability_type` | string | yes | Vulnerability name reported in results | +| `severity` | string | yes | `critical` \| `high` \| `medium` \| `low` | +| `argument_index` | int | no | Zero-based index of the dangerous argument. When omitted, any tainted argument triggers the sink. 
| +| `enabled` | bool | no | Default `true` | + +```yaml +sinks: + - name: internal_db_query + description: "Internal database wrapper" + pattern: 'API::moduleImport("myapp.db").getMember("query").getACall()' + sink_type: sql_execution + vulnerability_type: SQL Injection + severity: critical + argument_index: 0 # Only the first argument (the query string) matters +``` + +Use `argument_index` to avoid false positives when only one specific argument +of a multi-argument call is dangerous. For example, `cursor.execute(query, +params)` — only `query` (index `0`) should be treated as the sink, not +`params`. + +### `sanitizers[]` + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | string | yes | Unique identifier | +| `description` | string | yes | Human-readable explanation | +| `pattern` | string | yes | CodeQL API-graph expression | +| `sanitizes` | list[string] | no | Informational list of mitigated vulnerability types (not used by the query engine) | +| `enabled` | bool | no | Default `true` | + +```yaml +sanitizers: + - name: bleach_clean + description: "bleach.clean() HTML sanitiser" + pattern: 'API::moduleImport("bleach").getMember("clean").getACall()' + sanitizes: [xss] +``` + +> **Note:** All enabled sanitizers unconditionally block **all** taint flows +> passing through them. The `sanitizes` field is documentation only; per-flow +> sanitisation (blocking only XSS flows, not command injection flows) is not +> yet supported. 
+ +### `disabled_builtin_sinks` + +Suppress specific built-in CodeQL sink models without removing the rest: + +```yaml +disabled_builtin_sinks: + - PolynomialReDoS::Sink # too noisy on regex-heavy codebases + - CookieInjection::Sink +``` + +To list all available names at runtime: + +```bash +python -c " +from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator +print(*TaintQueryGenerator.builtin_sink_names(), sep='\n') +" +``` + +### Merge behaviour when `--taint-defaults` is active + +When a custom config is merged with the defaults: + +| Item | Behaviour | +|---|---| +| Sources | Union; custom entry with the same `name` **overrides** the default | +| Sinks | Union; custom entry with the same `name` overrides the default | +| Sanitizers | Union; same override rule | +| `disabled_builtin_sinks` | Union of both lists | +| `exclude_files` / `exclude_functions` | Union of both lists | +| Scalar options (`max_path_length`, `confidence_threshold`, etc.) | Custom value wins | +| Additive booleans (`include_implicit_flows`, `include_safe_flows`) | `OR` — enabling in either config enables globally | + +--- + +## Writing patterns + +Patterns are [CodeQL API-graph](https://codeql.github.com/docs/codeql-language-guides/using-the-api-graph-in-python/) +expressions. All string literals inside a pattern **must use double quotes** +(CodeQL does not support single-quoted strings). 
+ +### Common building blocks + +| Goal | Pattern | +|---|---| +| Module-level function call | `API::moduleImport("os").getMember("system").getACall()` | +| Nested attribute call | `API::moduleImport("os").getMember("path").getMember("join").getACall()` | +| Return value of a call | `API::moduleImport("requests").getMember("get").getReturn()` | +| Attribute of a return value | `API::moduleImport("requests").getMember("get").getReturn().getMember("text")` | +| Built-in function | `API::builtin("input").getACall()` | +| Class instance method | `API::moduleImport("sqlite3").getMember("connect").getReturn().getMember("cursor").getReturn().getMember("execute").getACall()` | + +### Source patterns + +For sources, the pattern should resolve to the **return value** of the call +(where the untrusted data lives): + +```yaml +# input() return value +pattern: 'API::builtin("input").getACall()' + +# Flask request argument +pattern: 'API::moduleImport("flask").getMember("request").getMember("args").getMember("get").getACall()' + +# Environment variable +pattern: 'API::moduleImport("os").getMember("getenv").getACall()' +``` + +### Sink patterns + +For sinks, the pattern should resolve to the **argument** that carries the +dangerous value. 
Use `argument_index` to target a specific argument, or omit +it to flag any tainted argument: + +```yaml +# Target argument 0 of cursor.execute(query, params) +pattern: 'API::moduleImport("sqlite3").getMember("connect").getReturn().getMember("cursor").getReturn().getMember("execute").getACall()' +argument_index: 0 + +# Flag any tainted argument (omit argument_index) +pattern: 'API::moduleImport("myapp.shell").getMember("run").getACall()' +``` + +### Sanitizer patterns + +Sanitizer patterns resolve to the **call that produces the safe value**: + +```yaml +pattern: 'API::moduleImport("html").getMember("escape").getACall()' +``` + +--- + +## Output format + +Results are returned as `PyTaintAnalysisResult` (accessible via the library +API or serialised to JSON/msgpack). Each detected flow has this structure: + +```json +{ + "flows": [ + { + "flow_id": "path/to/app.py:10->path/to/app.py:18", + "vulnerability_type": "SQL Injection", + "severity": "critical", + "confidence": "medium", + "source": { + "source_type": "user_input", + "description": "Direct user input via input() function", + "call_site": { + "method_name": "input", + "file_path": "app.py", + "start_line": 10, + "end_line": 10, + "start_column": 8, + "end_column": 15 + } + }, + "sink": { + "sink_type": "sql_execution", + "description": "SQL Injection", + "severity": "critical", + "call_site": { + "method_name": "execute", + "file_path": "app.py", + "start_line": 18, + "end_line": 18, + "start_column": 4, + "end_column": 22 + } + }, + "path": [ + { + "location": "app.py:10:8", + "function_name": "get_user", + "description": "Source node", + "step_type": "source" + }, + { + "location": "app.py:18:4", + "function_name": "query_db", + "description": "Sink node", + "step_type": "sink" + } + ] + } + ] +} +``` + +**Severity levels:** + +| Severity | Meaning | +|---|---| +| `critical` | Immediate exploitation likely (SQL/command/code/SSTI/deserialization) | +| `high` | High exploitability (path traversal, XSS, SSRF, 
XXE, LDAP, NoSQL, …) | +| `medium` | Exploitable under specific conditions (redirect, header injection, ReDoS, …) | +| `low` | Informational / low-impact | + +--- + +## Programmatic API + +### Running analysis + +```python +from pathlib import Path +from codeanalyzer.core import Codeanalyzer +from codeanalyzer.options import AnalysisOptions + +options = AnalysisOptions( + input=Path("/path/to/project"), + analysis_level=3, + using_codeql=True, + taint_config=Path("taint.yaml"), # optional + taint_use_defaults=True, # False = custom only +) + +with Codeanalyzer(options) as analyzer: + result = analyzer.analyze() + +taint = result.taint_analysis +print(f"{len(taint.flows)} flows detected") + +for flow in taint.flows: + print(f"[{flow.severity}] {flow.vulnerability_type}") + print(f" source: {flow.source.call_site.file_path}:{flow.source.call_site.start_line}") + print(f" sink: {flow.sink.call_site.file_path}:{flow.sink.call_site.start_line}") +``` + +### Loading and inspecting configuration + +```python +from codeanalyzer.config.taint_config_loader import TaintConfigLoader +from codeanalyzer.config.taint_config_defaults import get_default_taint_config +from codeanalyzer.semantic_analysis.codeql.taint_query_generator import TaintQueryGenerator + +# Load defaults only +config = TaintConfigLoader.load_config() + +# Load custom file, merged with defaults (mode 2) +config = TaintConfigLoader.load_config("taint.yaml", use_defaults=True) + +# Load custom file only (mode 3) +config = TaintConfigLoader.load_config("taint.yaml", use_defaults=False) + +# Inspect what is active +print(f"Sources: {len(config.sources)}") +print(f"User sinks:{len(config.sinks)}") +print(f"Built-in sinks: {TaintQueryGenerator.builtin_sink_count()}") +print(f"Disabled built-ins: {config.disabled_builtin_sinks}") +print(f"Sanitizers:{len(config.sanitizers)}") + +# All available built-in sink names (for use in disabled_builtin_sinks) +print(TaintQueryGenerator.builtin_sink_names()) + +# Validate a 
config and check for problems +issues = TaintConfigLoader.validate_config(config) +for issue in issues: + print(f"WARNING: {issue}") + +# Save current effective config to file (useful for debugging) +TaintConfigLoader.save_config(config, "effective-config.yaml", format="yaml") +``` + +--- + +## Troubleshooting + +### No flows detected + +1. **Check verbosity** — run with `-vv` to see the active config summary and + which sources/sinks are loaded. +2. **Verify source coverage** — your code may use a web framework already + covered by `RemoteFlowSource`, or it may use a non-web input not in the + defaults. Add a custom source for the latter. +3. **Check sanitizers** — a flow that is blocked by a default sanitizer + (e.g. `html.escape`, `shlex.quote`) will not be reported. Set + `include_safe_flows: true` temporarily to see sanitised paths. +4. **Check for excluded files** — if `exclude_files` or `exclude_functions` + is set in a config, those paths are silently skipped. +5. **Confirm CodeQL database** — the CodeQL database is built from the project + at analysis time. If the database is stale, use `--eager` to rebuild. + +### Too many false positives + +- Use `disabled_builtin_sinks` to suppress noisy sink classes (e.g. + `PolynomialReDoS::Sink` on regex-heavy codebases). +- Use `--no-taint-defaults` with a hand-crafted config file to constrain + analysis to only the flows you care about. +- Use `exclude_files` to skip test or vendor directories. +- Add sanitizer entries for project-specific validation functions. + +### Unexpected flows blocked (false negatives) + +- Check that the sanitizer pattern actually matches your code — test it by + temporarily disabling the sanitizer with `enabled: false`. +- CodeQL sanitizers are applied globally. If a sanitizer is too broad (e.g. + `os.path.normpath` blocking a non-path flow), disable it and add a narrower + one. + +### Config file not loading + +- Verify patterns use **double quotes** inside the YAML string. 
Single quotes + are a CodeQL syntax error. +- Run `validate_config()` programmatically (see above) to catch empty + patterns, duplicate names, or missing required fields. +- Check the log output at `-v` level — a `WARNING: Taint config: …` line + indicates a structural problem found at load time. + +### Getting the CodeQL CLI + +Taint analysis requires the [CodeQL CLI](https://github.com/github/codeql-cli-binaries/releases). +Download the archive for your platform, unpack it, and ensure the `codeql` +binary is on your `PATH`: + +```bash +codeql --version # should print the CodeQL version +``` + +The `codeql/python-all` pack is downloaded automatically on first use. diff --git a/test/test_taint_analysis.py b/test/test_taint_analysis.py index 853ac69..dfc6e86 100644 --- a/test/test_taint_analysis.py +++ b/test/test_taint_analysis.py @@ -141,6 +141,43 @@ def test_custom_configuration_yaml(self, sql_injection_app, tmp_path): assert config.sources[0].name == "user_input" assert config.sinks[0].vulnerability_type == "SQL Injection" + def test_custom_configuration_json(self, tmp_path): + """Test custom taint configuration loaded from a JSON file.""" + import json + config_data = { + "sources": [ + { + "name": "user_input", + "description": "User input from input() function", + "pattern": 'API::builtin("input").getACall()', + "source_type": "user_input", + "enabled": True, + } + ], + "sinks": [ + { + "name": "sql_execute", + "description": "SQL query execution", + "pattern": 'API::moduleImport("sqlite3").getMember("execute").getACall()', + "sink_type": "sql_execution", + "vulnerability_type": "SQL Injection", + "severity": "critical", + "enabled": True, + } + ], + "sanitizers": [], + } + config_file = tmp_path / "custom_taint_config.json" + config_file.write_text(json.dumps(config_data)) + + config = TaintConfigLoader.load_config(config_file, use_defaults=False) + + assert len(config.sources) == 1 + assert len(config.sinks) == 1 + assert len(config.sanitizers) == 0 + 
assert config.sources[0].name == "user_input" + assert config.sinks[0].vulnerability_type == "SQL Injection" + def test_config_merge_with_defaults(self, tmp_path): """Test merging custom config with defaults.""" # Create minimal custom config