From ac38ec0adebc52c6427704fe0b013a02eed83cfd Mon Sep 17 00:00:00 2001 From: aRustyDev <36318507+aRustyDev@users.noreply.github.com> Date: Thu, 25 Dec 2025 23:37:42 -0500 Subject: [PATCH] feat(skills): import tier 2 priority skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Added - k8s-helm-charts-dev: Helm chart scaffolding (from wshobson/agents) - data-analysis-polars-dev: Polars dataframe patterns (from K-Dense-AI) - lang-rust-errors-dev: Rust error handling (from hashintel/hash) - frontend-testing-playwright-dev: Playwright E2E testing (from lackeyjb) - lang-elixir-patterns-eng: Elixir/OTP architecture (from maxim-ist) All skills renamed to follow naming convention: - Pattern: -[-]- - New categories: k8s, data, lang, frontend πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../skills/data-analysis-polars-dev/SKILL.md | 381 ++++++ .../references/best_practices.md | 649 +++++++++++ .../references/core_concepts.md | 378 ++++++ .../references/io_guide.md | 557 +++++++++ .../references/operations.md | 602 ++++++++++ .../references/pandas_migration.md | 417 +++++++ .../references/transformations.md | 549 +++++++++ .../API_REFERENCE.md | 653 +++++++++++ .../frontend-testing-playwright-dev/SKILL.md | 453 ++++++++ .../lib/helpers.js | 441 +++++++ .../package.json | 26 + .../frontend-testing-playwright-dev/run.js | 228 ++++ .../skills/k8s-helm-charts-dev/SKILL.md | 544 +++++++++ .../assets/Chart.yaml.template | 42 + .../assets/values.yaml.template | 185 +++ .../references/chart-structure.md | 500 ++++++++ .../scripts/validate-chart.sh | 244 ++++ .../skills/lang-elixir-patterns-eng/SKILL.md | 1017 +++++++++++++++++ .../skills/lang-rust-errors-dev/SKILL.md | 130 +++ .../references/defining-errors.md | 277 +++++ .../references/documenting-errors.md | 349 ++++++ .../references/propagating-errors.md | 312 +++++ 22 files changed, 8934 insertions(+) create mode 100644 components/skills/data-analysis-polars-dev/SKILL.md create mode 100644 components/skills/data-analysis-polars-dev/references/best_practices.md create mode 100644 components/skills/data-analysis-polars-dev/references/core_concepts.md create mode 100644 components/skills/data-analysis-polars-dev/references/io_guide.md create mode 100644 components/skills/data-analysis-polars-dev/references/operations.md create mode 100644 components/skills/data-analysis-polars-dev/references/pandas_migration.md create mode 100644 components/skills/data-analysis-polars-dev/references/transformations.md create mode 100644 components/skills/frontend-testing-playwright-dev/API_REFERENCE.md create mode 100644 components/skills/frontend-testing-playwright-dev/SKILL.md create mode 100644 components/skills/frontend-testing-playwright-dev/lib/helpers.js create mode 100644 components/skills/frontend-testing-playwright-dev/package.json create mode 100755 components/skills/frontend-testing-playwright-dev/run.js create mode 100644 components/skills/k8s-helm-charts-dev/SKILL.md create mode 100644 components/skills/k8s-helm-charts-dev/assets/Chart.yaml.template create mode 100644 components/skills/k8s-helm-charts-dev/assets/values.yaml.template create mode 100644 components/skills/k8s-helm-charts-dev/references/chart-structure.md create mode 100755 components/skills/k8s-helm-charts-dev/scripts/validate-chart.sh create mode 100644 components/skills/lang-elixir-patterns-eng/SKILL.md create mode 100644 components/skills/lang-rust-errors-dev/SKILL.md create mode 100644 
components/skills/lang-rust-errors-dev/references/defining-errors.md create mode 100644 components/skills/lang-rust-errors-dev/references/documenting-errors.md create mode 100644 components/skills/lang-rust-errors-dev/references/propagating-errors.md diff --git a/components/skills/data-analysis-polars-dev/SKILL.md b/components/skills/data-analysis-polars-dev/SKILL.md new file mode 100644 index 0000000..a2f48e7 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/SKILL.md @@ -0,0 +1,381 @@ +--- +name: polars +description: "Fast DataFrame library (Apache Arrow). Select, filter, group_by, joins, lazy evaluation, CSV/Parquet I/O, expression API, for high-performance data analysis workflows." +--- + +# Polars + +## Overview + +Polars is a lightning-fast DataFrame library for Python and Rust built on Apache Arrow. Work with Polars' expression-based API, lazy evaluation framework, and high-performance data manipulation capabilities for efficient data processing, pandas migration, and data pipeline optimization. + +## Quick Start + +### Installation and Basic Usage + +Install Polars: +```python +uv pip install polars +``` + +Basic DataFrame creation and operations: +```python +import polars as pl + +# Create DataFrame +df = pl.DataFrame({ + "name": ["Alice", "Bob", "Charlie"], + "age": [25, 30, 35], + "city": ["NY", "LA", "SF"] +}) + +# Select columns +df.select("name", "age") + +# Filter rows +df.filter(pl.col("age") > 25) + +# Add computed columns +df.with_columns( + age_plus_10=pl.col("age") + 10 +) +``` + +## Core Concepts + +### Expressions + +Expressions are the fundamental building blocks of Polars operations. They describe transformations on data and can be composed, reused, and optimized. + +**Key principles:** +- Use `pl.col("column_name")` to reference columns +- Chain methods to build complex transformations +- Expressions are lazy and only execute within contexts (select, with_columns, filter, group_by) + +**Example:** +```python +# Expression-based computation +df.select( + pl.col("name"), + (pl.col("age") * 12).alias("age_in_months") +) +``` + +### Lazy vs Eager Evaluation + +**Eager (DataFrame):** Operations execute immediately +```python +df = pl.read_csv("file.csv") # Reads immediately +result = df.filter(pl.col("age") > 25) # Executes immediately +``` + +**Lazy (LazyFrame):** Operations build a query plan, optimized before execution +```python +lf = pl.scan_csv("file.csv") # Doesn't read yet +result = lf.filter(pl.col("age") > 25).select("name", "age") +df = result.collect() # Now executes optimized query +``` + +**When to use lazy:** +- Working with large datasets +- Complex query pipelines +- When only some columns/rows are needed +- Performance is critical + +**Benefits of lazy evaluation:** +- Automatic query optimization +- Predicate pushdown +- Projection pushdown +- Parallel execution + +For detailed concepts, load `references/core_concepts.md`. 
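+
+To verify that these optimizations are applied, print the optimized plan of a lazy query before collecting it. A minimal sketch, reusing the file and columns from the example above:
+
+```python
+import polars as pl
+
+lf = pl.scan_csv("file.csv")
+query = lf.filter(pl.col("age") > 25).select("name", "age")
+
+# The plan should show the filter (predicate) and the column selection
+# (projection) pushed down into the CSV scan
+print(query.explain())
+
+df = query.collect()  # Execute the optimized plan
+```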
+ +## Common Operations + +### Select +Select and manipulate columns: +```python +# Select specific columns +df.select("name", "age") + +# Select with expressions +df.select( + pl.col("name"), + (pl.col("age") * 2).alias("double_age") +) + +# Select all columns matching a pattern +df.select(pl.col("^.*_id$")) +``` + +### Filter +Filter rows by conditions: +```python +# Single condition +df.filter(pl.col("age") > 25) + +# Multiple conditions (cleaner than using &) +df.filter( + pl.col("age") > 25, + pl.col("city") == "NY" +) + +# Complex conditions +df.filter( + (pl.col("age") > 25) | (pl.col("city") == "LA") +) +``` + +### With Columns +Add or modify columns while preserving existing ones: +```python +# Add new columns +df.with_columns( + age_plus_10=pl.col("age") + 10, + name_upper=pl.col("name").str.to_uppercase() +) + +# Parallel computation (all columns computed in parallel) +df.with_columns( + pl.col("value") * 10, + pl.col("value") * 100, +) +``` + +### Group By and Aggregations +Group data and compute aggregations: +```python +# Basic grouping +df.group_by("city").agg( + pl.col("age").mean().alias("avg_age"), + pl.len().alias("count") +) + +# Multiple group keys +df.group_by("city", "department").agg( + pl.col("salary").sum() +) + +# Conditional aggregations +df.group_by("city").agg( + (pl.col("age") > 30).sum().alias("over_30") +) +``` + +For detailed operation patterns, load `references/operations.md`. + +## Aggregations and Window Functions + +### Aggregation Functions +Common aggregations within `group_by` context: +- `pl.len()` - count rows +- `pl.col("x").sum()` - sum values +- `pl.col("x").mean()` - average +- `pl.col("x").min()` / `pl.col("x").max()` - extremes +- `pl.first()` / `pl.last()` - first/last values + +### Window Functions with `over()` +Apply aggregations while preserving row count: +```python +# Add group statistics to each row +df.with_columns( + avg_age_by_city=pl.col("age").mean().over("city"), + rank_in_city=pl.col("salary").rank().over("city") +) + +# Multiple grouping columns +df.with_columns( + group_avg=pl.col("value").mean().over("category", "region") +) +``` + +**Mapping strategies:** +- `group_to_rows` (default): Preserves original row order +- `explode`: Faster but groups rows together +- `join`: Creates list columns + +## Data I/O + +### Supported Formats +Polars supports reading and writing: +- CSV, Parquet, JSON, Excel +- Databases (via connectors) +- Cloud storage (S3, Azure, GCS) +- Google BigQuery +- Multiple/partitioned files + +### Common I/O Operations + +**CSV:** +```python +# Eager +df = pl.read_csv("file.csv") +df.write_csv("output.csv") + +# Lazy (preferred for large files) +lf = pl.scan_csv("file.csv") +result = lf.filter(...).select(...).collect() +``` + +**Parquet (recommended for performance):** +```python +df = pl.read_parquet("file.parquet") +df.write_parquet("output.parquet") +``` + +**JSON:** +```python +df = pl.read_json("file.json") +df.write_json("output.json") +``` + +For comprehensive I/O documentation, load `references/io_guide.md`. 
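+
+The multi-file and cloud-storage sources listed above can be scanned lazily with glob patterns; a brief sketch (the paths and bucket name are illustrative):
+
+```python
+# Scan every Parquet file in a directory as one lazy table
+lf = pl.scan_parquet("data/*.parquet")
+
+# The same pattern works for object storage such as S3
+lf_s3 = pl.scan_parquet("s3://my-bucket/data/*.parquet")
+
+result = lf.group_by("city").agg(pl.len()).collect()
+```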
+ +## Transformations + +### Joins +Combine DataFrames: +```python +# Inner join +df1.join(df2, on="id", how="inner") + +# Left join +df1.join(df2, on="id", how="left") + +# Join on different column names +df1.join(df2, left_on="user_id", right_on="id") +``` + +### Concatenation +Stack DataFrames: +```python +# Vertical (stack rows) +pl.concat([df1, df2], how="vertical") + +# Horizontal (add columns) +pl.concat([df1, df2], how="horizontal") + +# Diagonal (union with different schemas) +pl.concat([df1, df2], how="diagonal") +``` + +### Pivot and Unpivot +Reshape data: +```python +# Pivot (wide format) +df.pivot(values="sales", index="date", columns="product") + +# Unpivot (long format) +df.unpivot(index="id", on=["col1", "col2"]) +``` + +For detailed transformation examples, load `references/transformations.md`. + +## Pandas Migration + +Polars offers significant performance improvements over pandas with a cleaner API. Key differences: + +### Conceptual Differences +- **No index**: Polars uses integer positions only +- **Strict typing**: No silent type conversions +- **Lazy evaluation**: Available via LazyFrame +- **Parallel by default**: Operations parallelized automatically + +### Common Operation Mappings + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Select column | `df["col"]` | `df.select("col")` | +| Filter | `df[df["col"] > 10]` | `df.filter(pl.col("col") > 10)` | +| Add column | `df.assign(x=...)` | `df.with_columns(x=...)` | +| Group by | `df.groupby("col").agg(...)` | `df.group_by("col").agg(...)` | +| Window | `df.groupby("col").transform(...)` | `df.with_columns(...).over("col")` | + +### Key Syntax Patterns + +**Pandas sequential (slow):** +```python +df.assign( + col_a=lambda df_: df_.value * 10, + col_b=lambda df_: df_.value * 100 +) +``` + +**Polars parallel (fast):** +```python +df.with_columns( + col_a=pl.col("value") * 10, + col_b=pl.col("value") * 100, +) +``` + +For comprehensive migration guide, load `references/pandas_migration.md`. + +## Best Practices + +### Performance Optimization + +1. **Use lazy evaluation for large datasets:** + ```python + lf = pl.scan_csv("large.csv") # Don't use read_csv + result = lf.filter(...).select(...).collect() + ``` + +2. **Avoid Python functions in hot paths:** + - Stay within expression API for parallelization + - Use `.map_elements()` only when necessary + - Prefer native Polars operations + +3. **Use streaming for very large data:** + ```python + lf.collect(streaming=True) + ``` + +4. **Select only needed columns early:** + ```python + # Good: Select columns early + lf.select("col1", "col2").filter(...) + + # Bad: Filter on all columns first + lf.filter(...).select("col1", "col2") + ``` + +5. **Use appropriate data types:** + - Categorical for low-cardinality strings + - Appropriate integer sizes (i32 vs i64) + - Date types for temporal data + +### Expression Patterns + +**Conditional operations:** +```python +pl.when(condition).then(value).otherwise(other_value) +``` + +**Column operations across multiple columns:** +```python +df.select(pl.col("^.*_value$") * 2) # Regex pattern +``` + +**Null handling:** +```python +pl.col("x").fill_null(0) +pl.col("x").is_null() +pl.col("x").drop_nulls() +``` + +For additional best practices and patterns, load `references/best_practices.md`. 
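+
+As a concrete instance of the `when`/`then`/`otherwise` pattern above (a small sketch; the column and labels are illustrative), with string literals wrapped in `pl.lit` so that recent Polars versions do not interpret them as column names:
+
+```python
+df = df.with_columns(
+    age_band=pl.when(pl.col("age") >= 18)
+    .then(pl.lit("adult"))  # literal label, not a column reference
+    .otherwise(pl.lit("minor"))
+)
+```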
+ +## Resources + +This skill includes comprehensive reference documentation: + +### references/ +- `core_concepts.md` - Detailed explanations of expressions, lazy evaluation, and type system +- `operations.md` - Comprehensive guide to all common operations with examples +- `pandas_migration.md` - Complete migration guide from pandas to Polars +- `io_guide.md` - Data I/O operations for all supported formats +- `transformations.md` - Joins, concatenation, pivots, and reshaping operations +- `best_practices.md` - Performance optimization tips and common patterns + +Load these references as needed when users require detailed information about specific topics. diff --git a/components/skills/data-analysis-polars-dev/references/best_practices.md b/components/skills/data-analysis-polars-dev/references/best_practices.md new file mode 100644 index 0000000..0585ed6 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/best_practices.md @@ -0,0 +1,649 @@ +# Polars Best Practices and Performance Guide + +Comprehensive guide to writing efficient Polars code and avoiding common pitfalls. + +## Performance Optimization + +### 1. Use Lazy Evaluation + +**Always prefer lazy mode for large datasets:** + +```python +# Bad: Eager mode loads everything immediately +df = pl.read_csv("large_file.csv") +result = df.filter(pl.col("age") > 25).select("name", "age") + +# Good: Lazy mode optimizes before execution +lf = pl.scan_csv("large_file.csv") +result = lf.filter(pl.col("age") > 25).select("name", "age").collect() +``` + +**Benefits of lazy evaluation:** +- Predicate pushdown (filter at source) +- Projection pushdown (read only needed columns) +- Query optimization +- Parallel execution planning + +### 2. Filter and Select Early + +Push filters and column selection as early as possible in the pipeline: + +```python +# Bad: Process all data, then filter and select +result = ( + lf.group_by("category") + .agg(pl.col("value").mean()) + .join(other, on="category") + .filter(pl.col("value") > 100) + .select("category", "value") +) + +# Good: Filter and select early +result = ( + lf.select("category", "value") # Only needed columns + .filter(pl.col("value") > 100) # Filter early + .group_by("category") + .agg(pl.col("value").mean()) + .join(other.select("category", "other_col"), on="category") +) +``` + +### 3. Avoid Python Functions + +Stay within the expression API to maintain parallelization: + +```python +# Bad: Python function disables parallelization +df = df.with_columns( + result=pl.col("value").map_elements(lambda x: x * 2, return_dtype=pl.Float64) +) + +# Good: Use native expressions (parallelized) +df = df.with_columns(result=pl.col("value") * 2) +``` + +**When you must use custom functions:** +```python +# If truly needed, be explicit +df = df.with_columns( + result=pl.col("value").map_elements( + custom_function, + return_dtype=pl.Float64, + skip_nulls=True # Optimize null handling + ) +) +``` + +### 4. Use Streaming for Very Large Data + +Enable streaming for datasets larger than RAM: + +```python +# Streaming mode processes data in chunks +lf = pl.scan_parquet("very_large.parquet") +result = lf.filter(pl.col("value") > 100).collect(streaming=True) + +# Or use sink for direct streaming writes +lf.filter(pl.col("value") > 100).sink_parquet("output.parquet") +``` + +### 5. 
Optimize Data Types
+
+Choose appropriate data types to reduce memory and improve performance:
+
+```python
+# Bad: Default types may be wasteful
+df = pl.read_csv("data.csv")
+
+# Good: Specify optimal types
+df = pl.read_csv(
+    "data.csv",
+    dtypes={
+        "id": pl.UInt32,  # Instead of Int64 if values fit
+        "category": pl.Categorical,  # For low-cardinality strings
+        "date": pl.Date,  # Instead of String
+        "small_int": pl.Int16,  # Instead of Int64
+    }
+)
+```
+
+**Type optimization guidelines:**
+- Use the smallest integer type that fits your data
+- Use `Categorical` for strings with low cardinality (<50% unique)
+- Use `Date` instead of `Datetime` when time isn't needed
+- Use `Boolean` instead of integers for binary flags
+
+### 6. Parallel Operations
+
+Structure code to maximize parallelization:
+
+```python
+# Bad: Sequential pipe operations disable parallelization
+df = (
+    df.pipe(operation1)
+    .pipe(operation2)
+    .pipe(operation3)
+)
+
+# Good: Combined operations enable parallelization
+df = df.with_columns(
+    result1=operation1_expr(),
+    result2=operation2_expr(),
+    result3=operation3_expr()
+)
+```
+
+### 7. Rechunk After Concatenation
+
+```python
+# Concatenation can fragment data
+combined = pl.concat([df1, df2, df3])
+
+# Rechunk for better performance in subsequent operations
+combined = pl.concat([df1, df2, df3], rechunk=True)
+```
+
+## Expression Patterns
+
+### Conditional Logic
+
+**Simple conditions:**
+```python
+df.with_columns(
+    status=pl.when(pl.col("age") >= 18)
+    .then("adult")
+    .otherwise("minor")
+)
+```
+
+**Multiple conditions:**
+```python
+df.with_columns(
+    grade=pl.when(pl.col("score") >= 90)
+    .then("A")
+    .when(pl.col("score") >= 80)
+    .then("B")
+    .when(pl.col("score") >= 70)
+    .then("C")
+    .when(pl.col("score") >= 60)
+    .then("D")
+    .otherwise("F")
+)
+```
+
+**Complex conditions:**
+```python
+df.with_columns(
+    category=pl.when(
+        (pl.col("revenue") > 1000000) & (pl.col("customers") > 100)
+    )
+    .then("enterprise")
+    .when(
+        (pl.col("revenue") > 100000) | (pl.col("customers") > 50)
+    )
+    .then("business")
+    .otherwise("starter")
+)
+```
+
+### Null Handling
+
+**Check for nulls:**
+```python
+df.filter(pl.col("value").is_null())
+df.filter(pl.col("value").is_not_null())
+```
+
+**Fill nulls:**
+```python
+# Constant value
+df.with_columns(pl.col("value").fill_null(0))
+
+# Forward fill
+df.with_columns(pl.col("value").fill_null(strategy="forward"))
+
+# Backward fill
+df.with_columns(pl.col("value").fill_null(strategy="backward"))
+
+# Mean
+df.with_columns(pl.col("value").fill_null(strategy="mean"))
+
+# Per-group fill
+df.with_columns(
+    pl.col("value").fill_null(pl.col("value").mean()).over("group")
+)
+```
+
+**Coalesce (first non-null):**
+```python
+df.with_columns(
+    combined=pl.coalesce(["col1", "col2", "col3"])
+)
+```
+
+### Column Selection Patterns
+
+**By name:**
+```python
+df.select("col1", "col2", "col3")
+```
+
+**By pattern:**
+```python
+# Selection regexes must start with ^ and end with $,
+# otherwise the string is treated as a literal column name
+
+# Regex
+df.select(pl.col("^sales_.*$"))
+
+# Starts with
+df.select(pl.col("^sales.*$"))
+
+# Ends with
+df.select(pl.col("^.*_total$"))
+
+# Contains
+df.select(pl.col("^.*revenue.*$"))
+```
+
+**By type:**
+```python
+# All numeric columns
+df.select(pl.col(pl.NUMERIC_DTYPES))
+
+# All string columns
+df.select(pl.col(pl.Utf8))
+
+# Multiple types
+df.select(pl.col(pl.NUMERIC_DTYPES, pl.Boolean))
+```
+
+**Exclude columns:**
+```python
+df.select(pl.all().exclude("id", "timestamp"))
+```
+
+**Transform multiple columns:**
+```python
+# Apply same operation to multiple columns
+df.select(
pl.col("^sales_.*$") * 1.1 # 10% increase to all sales columns +) +``` + +### Aggregation Patterns + +**Multiple aggregations:** +```python +df.group_by("category").agg( + pl.col("value").sum().alias("total"), + pl.col("value").mean().alias("average"), + pl.col("value").std().alias("std_dev"), + pl.col("id").count().alias("count"), + pl.col("id").n_unique().alias("unique_count"), + pl.col("value").min().alias("minimum"), + pl.col("value").max().alias("maximum"), + pl.col("value").quantile(0.5).alias("median"), + pl.col("value").quantile(0.95).alias("p95") +) +``` + +**Conditional aggregations:** +```python +df.group_by("category").agg( + # Count high values + (pl.col("value") > 100).sum().alias("high_count"), + + # Average of filtered values + pl.col("value").filter(pl.col("active")).mean().alias("active_avg"), + + # Conditional sum + pl.when(pl.col("status") == "completed") + .then(pl.col("amount")) + .otherwise(0) + .sum() + .alias("completed_total") +) +``` + +**Grouped transformations:** +```python +df.with_columns( + # Group statistics + group_mean=pl.col("value").mean().over("category"), + group_std=pl.col("value").std().over("category"), + + # Rank within groups + rank=pl.col("value").rank().over("category"), + + # Percentage of group total + pct_of_group=(pl.col("value") / pl.col("value").sum().over("category")) * 100 +) +``` + +## Common Pitfalls and Anti-Patterns + +### Pitfall 1: Row Iteration + +```python +# Bad: Never iterate rows +for row in df.iter_rows(): + # Process row + result = row[0] * 2 + +# Good: Use vectorized operations +df = df.with_columns(result=pl.col("value") * 2) +``` + +### Pitfall 2: Modifying in Place + +```python +# Bad: Polars is immutable, this doesn't work as expected +df["new_col"] = df["old_col"] * 2 # May work but not recommended + +# Good: Functional style +df = df.with_columns(new_col=pl.col("old_col") * 2) +``` + +### Pitfall 3: Not Using Expressions + +```python +# Bad: String-based operations +df.select("value * 2") # Won't work + +# Good: Expression-based +df.select(pl.col("value") * 2) +``` + +### Pitfall 4: Inefficient Joins + +```python +# Bad: Join large tables without filtering +result = large_df1.join(large_df2, on="id") + +# Good: Filter before joining +result = ( + large_df1.filter(pl.col("active")) + .join( + large_df2.filter(pl.col("status") == "valid"), + on="id" + ) +) +``` + +### Pitfall 5: Not Specifying Types + +```python +# Bad: Let Polars infer everything +df = pl.read_csv("data.csv") + +# Good: Specify types for correctness and performance +df = pl.read_csv( + "data.csv", + dtypes={"id": pl.Int64, "date": pl.Date, "category": pl.Categorical} +) +``` + +### Pitfall 6: Creating Many Small DataFrames + +```python +# Bad: Many operations creating intermediate DataFrames +df1 = df.filter(pl.col("age") > 25) +df2 = df1.select("name", "age") +df3 = df2.sort("age") +result = df3.head(10) + +# Good: Chain operations +result = ( + df.filter(pl.col("age") > 25) + .select("name", "age") + .sort("age") + .head(10) +) + +# Better: Use lazy mode +result = ( + df.lazy() + .filter(pl.col("age") > 25) + .select("name", "age") + .sort("age") + .head(10) + .collect() +) +``` + +## Memory Management + +### Monitor Memory Usage + +```python +# Check DataFrame size +print(f"Estimated size: {df.estimated_size('mb'):.2f} MB") + +# Profile memory during operations +lf = pl.scan_csv("large.csv") +print(lf.explain()) # See query plan +``` + +### Reduce Memory Footprint + +```python +# 1. Use lazy mode +lf = pl.scan_parquet("data.parquet") + +# 2. 
Stream results +result = lf.collect(streaming=True) + +# 3. Select only needed columns +lf = lf.select("col1", "col2") + +# 4. Optimize data types +df = df.with_columns( + pl.col("int_col").cast(pl.Int32), # Downcast if possible + pl.col("category").cast(pl.Categorical) # For low cardinality +) + +# 5. Drop columns not needed +df = df.drop("large_text_col", "unused_col") +``` + +## Testing and Debugging + +### Inspect Query Plans + +```python +lf = pl.scan_csv("data.csv") +query = lf.filter(pl.col("age") > 25).select("name", "age") + +# View the optimized query plan +print(query.explain()) + +# View detailed query plan +print(query.explain(optimized=True)) +``` + +### Sample Data for Development + +```python +# Use n_rows for testing +df = pl.read_csv("large.csv", n_rows=1000) + +# Or sample after reading +df_sample = df.sample(n=1000, seed=42) +``` + +### Validate Schemas + +```python +# Check schema +print(df.schema) + +# Ensure schema matches expectation +expected_schema = { + "id": pl.Int64, + "name": pl.Utf8, + "date": pl.Date +} + +assert df.schema == expected_schema +``` + +### Profile Performance + +```python +import time + +# Time operations +start = time.time() +result = lf.collect() +print(f"Execution time: {time.time() - start:.2f}s") + +# Compare eager vs lazy +start = time.time() +df_eager = pl.read_csv("data.csv").filter(pl.col("age") > 25) +eager_time = time.time() - start + +start = time.time() +df_lazy = pl.scan_csv("data.csv").filter(pl.col("age") > 25).collect() +lazy_time = time.time() - start + +print(f"Eager: {eager_time:.2f}s, Lazy: {lazy_time:.2f}s") +``` + +## File Format Best Practices + +### Choose the Right Format + +**Parquet:** +- Best for: Large datasets, archival, data lakes +- Pros: Excellent compression, columnar, fast reads +- Cons: Not human-readable + +**CSV:** +- Best for: Small datasets, human inspection, legacy systems +- Pros: Universal, human-readable +- Cons: Slow, large file size, no type preservation + +**Arrow IPC:** +- Best for: Inter-process communication, temporary storage +- Pros: Fastest, zero-copy, preserves all types +- Cons: Less compression than Parquet + +### File Reading Best Practices + +```python +# 1. Use lazy reading +lf = pl.scan_parquet("data.parquet") # Not read_parquet + +# 2. Read multiple files efficiently +lf = pl.scan_parquet("data/*.parquet") # Parallel reading + +# 3. Specify schema when known +lf = pl.scan_csv( + "data.csv", + dtypes={"id": pl.Int64, "date": pl.Date} +) + +# 4. Use predicate pushdown +result = lf.filter(pl.col("date") >= "2023-01-01").collect() +``` + +### File Writing Best Practices + +```python +# 1. Use Parquet for large data +df.write_parquet("output.parquet", compression="zstd") + +# 2. Partition large datasets +df.write_parquet("output", partition_by=["year", "month"]) + +# 3. Use streaming for very large writes +lf.sink_parquet("output.parquet") # Streaming write + +# 4. 
Optimize compression +df.write_parquet( + "output.parquet", + compression="snappy", # Fast compression + statistics=True # Enable predicate pushdown on read +) +``` + +## Code Organization + +### Reusable Expressions + +```python +# Define reusable expressions +age_group = ( + pl.when(pl.col("age") < 18) + .then("minor") + .when(pl.col("age") < 65) + .then("adult") + .otherwise("senior") +) + +revenue_per_customer = pl.col("revenue") / pl.col("customer_count") + +# Use in multiple contexts +df = df.with_columns( + age_group=age_group, + rpc=revenue_per_customer +) + +# Reuse in filtering +df = df.filter(revenue_per_customer > 100) +``` + +### Pipeline Functions + +```python +def clean_data(lf: pl.LazyFrame) -> pl.LazyFrame: + """Clean and standardize data.""" + return lf.with_columns( + pl.col("name").str.to_uppercase(), + pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"), + pl.col("amount").fill_null(0) + ) + +def add_features(lf: pl.LazyFrame) -> pl.LazyFrame: + """Add computed features.""" + return lf.with_columns( + month=pl.col("date").dt.month(), + year=pl.col("date").dt.year(), + amount_log=pl.col("amount").log() + ) + +# Compose pipeline +result = ( + pl.scan_csv("data.csv") + .pipe(clean_data) + .pipe(add_features) + .filter(pl.col("year") == 2023) + .collect() +) +``` + +## Documentation + +Always document complex expressions and transformations: + +```python +# Good: Document intent +df = df.with_columns( + # Calculate customer lifetime value as sum of purchases + # divided by months since first purchase + clv=( + pl.col("total_purchases") / + ((pl.col("last_purchase_date") - pl.col("first_purchase_date")) + .dt.total_days() / 30) + ) +) +``` + +## Version Compatibility + +```python +# Check Polars version +import polars as pl +print(pl.__version__) + +# Feature availability varies by version +# Document version requirements for production code +``` diff --git a/components/skills/data-analysis-polars-dev/references/core_concepts.md b/components/skills/data-analysis-polars-dev/references/core_concepts.md new file mode 100644 index 0000000..e3a0e56 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/core_concepts.md @@ -0,0 +1,378 @@ +# Polars Core Concepts + +## Expressions + +Expressions are the foundation of Polars' API. They are composable units that describe data transformations without executing them immediately. + +### What are Expressions? + +An expression describes a transformation on data. 
It only materializes (executes) within specific contexts: +- `select()` - Select and transform columns +- `with_columns()` - Add or modify columns +- `filter()` - Filter rows +- `group_by().agg()` - Aggregate data + +### Expression Syntax + +**Basic column reference:** +```python +pl.col("column_name") +``` + +**Computed expressions:** +```python +# Arithmetic +pl.col("height") * 2 +pl.col("price") + pl.col("tax") + +# With alias +(pl.col("weight") / (pl.col("height") ** 2)).alias("bmi") + +# Method chaining +pl.col("name").str.to_uppercase().str.slice(0, 3) +``` + +### Expression Contexts + +**Select context:** +```python +df.select( + "name", # Simple column name + pl.col("age"), # Expression + (pl.col("age") * 12).alias("age_in_months") # Computed expression +) +``` + +**With_columns context:** +```python +df.with_columns( + age_doubled=pl.col("age") * 2, + name_upper=pl.col("name").str.to_uppercase() +) +``` + +**Filter context:** +```python +df.filter( + pl.col("age") > 25, + pl.col("city").is_in(["NY", "LA", "SF"]) +) +``` + +**Group_by context:** +```python +df.group_by("department").agg( + pl.col("salary").mean(), + pl.col("employee_id").count() +) +``` + +### Expression Expansion + +Apply operations to multiple columns at once: + +**All columns:** +```python +df.select(pl.all() * 2) +``` + +**Pattern matching:** +```python +# All columns ending with "_value" +df.select(pl.col("^.*_value$") * 100) + +# All numeric columns +df.select(pl.col(pl.NUMERIC_DTYPES) + 1) +``` + +**Exclude patterns:** +```python +df.select(pl.all().exclude("id", "name")) +``` + +### Expression Composition + +Expressions can be stored and reused: + +```python +# Define reusable expressions +age_expression = pl.col("age") * 12 +name_expression = pl.col("name").str.to_uppercase() + +# Use in multiple contexts +df.select(age_expression, name_expression) +df.with_columns(age_months=age_expression) +``` + +## Data Types + +Polars has a strict type system based on Apache Arrow. 
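+
+Column dtypes can be inspected through a frame's schema; a tiny sketch with illustrative data:
+
+```python
+df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
+print(df.schema)   # maps each column name to its dtype
+print(df.dtypes)   # dtypes in column order
+```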
+ +### Core Data Types + +**Numeric:** +- `Int8`, `Int16`, `Int32`, `Int64` - Signed integers +- `UInt8`, `UInt16`, `UInt32`, `UInt64` - Unsigned integers +- `Float32`, `Float64` - Floating point numbers + +**Text:** +- `Utf8` / `String` - UTF-8 encoded strings +- `Categorical` - Categorized strings (low cardinality) +- `Enum` - Fixed set of string values + +**Temporal:** +- `Date` - Calendar date (no time) +- `Datetime` - Date and time with optional timezone +- `Time` - Time of day +- `Duration` - Time duration/difference + +**Boolean:** +- `Boolean` - True/False values + +**Nested:** +- `List` - Variable-length lists +- `Array` - Fixed-length arrays +- `Struct` - Nested record structures + +**Other:** +- `Binary` - Binary data +- `Object` - Python objects (avoid in production) +- `Null` - Null type + +### Type Casting + +Convert between types explicitly: + +```python +# Cast to different type +df.select( + pl.col("age").cast(pl.Float64), + pl.col("date_string").str.strptime(pl.Date, "%Y-%m-%d"), + pl.col("id").cast(pl.Utf8) +) +``` + +### Null Handling + +Polars uses consistent null handling across all types: + +**Check for nulls:** +```python +df.filter(pl.col("value").is_null()) +df.filter(pl.col("value").is_not_null()) +``` + +**Fill nulls:** +```python +pl.col("value").fill_null(0) +pl.col("value").fill_null(strategy="forward") +pl.col("value").fill_null(strategy="backward") +pl.col("value").fill_null(strategy="mean") +``` + +**Drop nulls:** +```python +df.drop_nulls() # Drop any row with nulls +df.drop_nulls(subset=["col1", "col2"]) # Drop rows with nulls in specific columns +``` + +### Categorical Data + +Use categorical types for string columns with low cardinality (repeated values): + +```python +# Cast to categorical +df.with_columns( + pl.col("category").cast(pl.Categorical) +) + +# Benefits: +# - Reduced memory usage +# - Faster grouping and joining +# - Maintains order information +``` + +## Lazy vs Eager Evaluation + +Polars supports two execution modes: eager (DataFrame) and lazy (LazyFrame). 
+ +### Eager Evaluation (DataFrame) + +Operations execute immediately: + +```python +import polars as pl + +# DataFrame operations execute right away +df = pl.read_csv("data.csv") # Reads file immediately +result = df.filter(pl.col("age") > 25) # Filters immediately +final = result.select("name", "age") # Selects immediately +``` + +**When to use eager:** +- Small datasets that fit in memory +- Interactive exploration in notebooks +- Simple one-off operations +- Immediate feedback needed + +### Lazy Evaluation (LazyFrame) + +Operations build a query plan, optimized before execution: + +```python +import polars as pl + +# LazyFrame operations build a query plan +lf = pl.scan_csv("data.csv") # Doesn't read yet +lf2 = lf.filter(pl.col("age") > 25) # Adds to plan +lf3 = lf2.select("name", "age") # Adds to plan +df = lf3.collect() # NOW executes optimized plan +``` + +**When to use lazy:** +- Large datasets +- Complex query pipelines +- Only need subset of data +- Performance is critical +- Streaming required + +### Query Optimization + +Polars automatically optimizes lazy queries: + +**Predicate Pushdown:** +Filter operations pushed to data source when possible: +```python +# Only reads rows where age > 25 from CSV +lf = pl.scan_csv("data.csv") +result = lf.filter(pl.col("age") > 25).collect() +``` + +**Projection Pushdown:** +Only read needed columns from data source: +```python +# Only reads "name" and "age" columns from CSV +lf = pl.scan_csv("data.csv") +result = lf.select("name", "age").collect() +``` + +**Query Plan Inspection:** +```python +# View the optimized query plan +lf = pl.scan_csv("data.csv") +result = lf.filter(pl.col("age") > 25).select("name", "age") +print(result.explain()) # Shows optimized plan +``` + +### Streaming Mode + +Process data larger than memory: + +```python +# Enable streaming for very large datasets +lf = pl.scan_csv("very_large.csv") +result = lf.filter(pl.col("age") > 25).collect(streaming=True) +``` + +**Streaming benefits:** +- Process data larger than RAM +- Lower peak memory usage +- Chunk-based processing +- Automatic memory management + +**Streaming limitations:** +- Not all operations support streaming +- May be slower for small data +- Some operations require materializing entire dataset + +### Converting Between Eager and Lazy + +**Eager to Lazy:** +```python +df = pl.read_csv("data.csv") +lf = df.lazy() # Convert to LazyFrame +``` + +**Lazy to Eager:** +```python +lf = pl.scan_csv("data.csv") +df = lf.collect() # Execute and return DataFrame +``` + +## Memory Format + +Polars uses Apache Arrow columnar memory format: + +**Benefits:** +- Zero-copy data sharing with other Arrow libraries +- Efficient columnar operations +- SIMD vectorization +- Reduced memory overhead +- Fast serialization + +**Implications:** +- Data stored column-wise, not row-wise +- Column operations very fast +- Random row access slower than pandas +- Best for analytical workloads + +## Parallelization + +Polars parallelizes operations automatically using Rust's concurrency: + +**What gets parallelized:** +- Aggregations within groups +- Window functions +- Most expression evaluations +- File reading (multiple files) +- Join operations + +**What to avoid for parallelization:** +- Python user-defined functions (UDFs) +- Lambda functions in `.map_elements()` +- Sequential `.pipe()` chains + +**Best practice:** +```python +# Good: Stays in expression API (parallelized) +df.with_columns( + pl.col("value") * 10, + pl.col("value").log(), + pl.col("value").sqrt() +) + +# Bad: Uses 
Python function (sequential) +df.with_columns( + pl.col("value").map_elements(lambda x: x * 10) +) +``` + +## Strict Type System + +Polars enforces strict typing: + +**No silent conversions:** +```python +# This will error - can't mix types +# df.with_columns(pl.col("int_col") + "string") + +# Must cast explicitly +df.with_columns( + pl.col("int_col").cast(pl.Utf8) + "_suffix" +) +``` + +**Benefits:** +- Prevents silent bugs +- Predictable behavior +- Better performance +- Clearer code intent + +**Integer nulls:** +Unlike pandas, integer columns can have nulls without converting to float: +```python +# In pandas: Int column with null becomes Float +# In polars: Int column with null stays Int (with null values) +df = pl.DataFrame({"int_col": [1, 2, None, 4]}) +# dtype: Int64 (not Float64) +``` diff --git a/components/skills/data-analysis-polars-dev/references/io_guide.md b/components/skills/data-analysis-polars-dev/references/io_guide.md new file mode 100644 index 0000000..bbb9dc9 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/io_guide.md @@ -0,0 +1,557 @@ +# Polars Data I/O Guide + +Comprehensive guide to reading and writing data in various formats with Polars. + +## CSV Files + +### Reading CSV + +**Eager mode (loads into memory):** +```python +import polars as pl + +# Basic read +df = pl.read_csv("data.csv") + +# With options +df = pl.read_csv( + "data.csv", + separator=",", + has_header=True, + columns=["col1", "col2"], # Select specific columns + n_rows=1000, # Read only first 1000 rows + skip_rows=10, # Skip first 10 rows + dtypes={"col1": pl.Int64, "col2": pl.Utf8}, # Specify types + null_values=["NA", "null", ""], # Define null values + encoding="utf-8", + ignore_errors=False +) +``` + +**Lazy mode (scans without loading - recommended for large files):** +```python +# Scan CSV (builds query plan) +lf = pl.scan_csv("data.csv") + +# Apply operations +result = lf.filter(pl.col("age") > 25).select("name", "age") + +# Execute and load +df = result.collect() +``` + +### Writing CSV + +```python +# Basic write +df.write_csv("output.csv") + +# With options +df.write_csv( + "output.csv", + separator=",", + include_header=True, + null_value="", # How to represent nulls + quote_char='"', + line_terminator="\n" +) +``` + +### Multiple CSV Files + +**Read multiple files:** +```python +# Read all CSVs in directory +lf = pl.scan_csv("data/*.csv") + +# Read specific files +lf = pl.scan_csv(["file1.csv", "file2.csv", "file3.csv"]) +``` + +## Parquet Files + +Parquet is the recommended format for performance and compression. 
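+
+A common workflow, sketched below with illustrative file names, is to convert raw CSV input to Parquet once and scan the Parquet copy afterwards:
+
+```python
+# One-time conversion: stream the CSV into a Parquet file
+pl.scan_csv("events.csv").sink_parquet("events.parquet")
+
+# Later reads are faster and keep the schema
+lf = pl.scan_parquet("events.parquet")
+```
+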
### Reading Parquet
+
+**Eager:**
+```python
+df = pl.read_parquet("data.parquet")
+
+# With options
+df = pl.read_parquet(
+    "data.parquet",
+    columns=["col1", "col2"],  # Select specific columns
+    n_rows=1000,  # Read first N rows
+    parallel="auto"  # Control parallelization
+)
+```
+
+**Lazy (recommended):**
+```python
+lf = pl.scan_parquet("data.parquet")
+
+# Automatic predicate and projection pushdown
+result = lf.filter(pl.col("age") > 25).select("name", "age").collect()
+```
+
+### Writing Parquet
+
+```python
+# Basic write
+df.write_parquet("output.parquet")
+
+# With compression
+df.write_parquet(
+    "output.parquet",
+    compression="snappy",  # Options: "snappy", "gzip", "brotli", "lz4", "zstd"
+    statistics=True,  # Write statistics (enables predicate pushdown)
+    use_pyarrow=False  # Use Rust writer (faster)
+)
+```
+
+### Partitioned Parquet (Hive-style)
+
+**Write partitioned:**
+```python
+# Write with partitioning
+df.write_parquet(
+    "output_dir",
+    partition_by=["year", "month"]  # Creates directory structure
+)
+# Creates: output_dir/year=2023/month=01/data.parquet
+```
+
+**Read partitioned:**
+```python
+lf = pl.scan_parquet("output_dir/**/*.parquet")
+
+# Hive partitioning columns are automatically added
+result = lf.filter(pl.col("year") == 2023).collect()
+```
+
+## JSON Files
+
+### Reading JSON
+
+**NDJSON (newline-delimited JSON) - recommended:**
+```python
+df = pl.read_ndjson("data.ndjson")
+
+# Lazy
+lf = pl.scan_ndjson("data.ndjson")
+```
+
+**Standard JSON:**
+```python
+df = pl.read_json("data.json")
+
+# From a JSON string (wrap it in a buffer; a plain str is treated as a path)
+import io
+df = pl.read_json(io.StringIO('{"col1": [1, 2], "col2": ["a", "b"]}'))
+```
+
+### Writing JSON
+
+```python
+# Write NDJSON
+df.write_ndjson("output.ndjson")
+
+# Write standard JSON
+df.write_json("output.json")
+
+# Pretty printed
+df.write_json("output.json", pretty=True, row_oriented=False)
+```
+
+## Excel Files
+
+### Reading Excel
+
+```python
+# Read first sheet
+df = pl.read_excel("data.xlsx")
+
+# Specific sheet
+df = pl.read_excel("data.xlsx", sheet_name="Sheet1")
+# Or by 1-based index (sheet_id=0 returns every sheet as a dict)
+df = pl.read_excel("data.xlsx", sheet_id=1)
+
+# With options
+df = pl.read_excel(
+    "data.xlsx",
+    sheet_name="Sheet1",
+    columns=["A", "B", "C"],  # Excel columns
+    n_rows=100,
+    skip_rows=5,
+    has_header=True
+)
+```
+
+### Writing Excel
+
+```python
+# Write to Excel
+df.write_excel("output.xlsx")
+
+# Multiple sheets: write_excel accepts an xlsxwriter Workbook
+from xlsxwriter import Workbook
+
+with Workbook("output.xlsx") as workbook:
+    df1.write_excel(workbook, worksheet="Sheet1")
+    df2.write_excel(workbook, worksheet="Sheet2")
+```
+
+## Database Connectivity
+
+### Read from Database
+
+```python
+import polars as pl
+
+# Read via a connection URI (read_database_uri uses connectorx under the hood)
+df = pl.read_database_uri("SELECT * FROM users", uri="postgresql://user:pass@localhost/db")
+
+# Or read using an existing connection object (e.g. a SQLAlchemy engine),
+# created as in the write example below
+df = pl.read_database("SELECT * FROM users WHERE age > 25", connection=engine)
+```
+
+### Write to Database
+
+```python
+# Using SQLAlchemy
+from sqlalchemy import create_engine
+
+engine = create_engine("postgresql://user:pass@localhost/db")
+df.write_database("table_name", connection=engine)
+
+# With options
+df.write_database(
+    "table_name",
+    connection=engine,
+    if_exists="replace",  # or "append", "fail"
+)
+```
+
+### Common Database Connectors
+
+**PostgreSQL:**
+```python
+uri = "postgresql://username:password@localhost:5432/database"
+df = pl.read_database_uri("SELECT * FROM table", uri=uri)
+```
+
+**MySQL:**
+```python
+uri = "mysql://username:password@localhost:3306/database"
+df = pl.read_database_uri("SELECT * FROM table",
uri=uri) +``` + +**SQLite:** +```python +uri = "sqlite:///path/to/database.db" +df = pl.read_database_uri("SELECT * FROM table", uri=uri) +``` + +## Cloud Storage + +### AWS S3 + +```python +# Read from S3 +df = pl.read_parquet("s3://bucket/path/to/file.parquet") +lf = pl.scan_parquet("s3://bucket/path/*.parquet") + +# Write to S3 +df.write_parquet("s3://bucket/path/output.parquet") + +# With credentials +import os +os.environ["AWS_ACCESS_KEY_ID"] = "your_key" +os.environ["AWS_SECRET_ACCESS_KEY"] = "your_secret" +os.environ["AWS_REGION"] = "us-west-2" + +df = pl.read_parquet("s3://bucket/file.parquet") +``` + +### Azure Blob Storage + +```python +# Read from Azure +df = pl.read_parquet("az://container/path/file.parquet") + +# Write to Azure +df.write_parquet("az://container/path/output.parquet") + +# With credentials +os.environ["AZURE_STORAGE_ACCOUNT_NAME"] = "account" +os.environ["AZURE_STORAGE_ACCOUNT_KEY"] = "key" +``` + +### Google Cloud Storage (GCS) + +```python +# Read from GCS +df = pl.read_parquet("gs://bucket/path/file.parquet") + +# Write to GCS +df.write_parquet("gs://bucket/path/output.parquet") + +# With credentials +os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/credentials.json" +``` + +## Google BigQuery + +```python +# Read from BigQuery +df = pl.read_database( + "SELECT * FROM project.dataset.table", + connection_uri="bigquery://project" +) + +# Or using Google Cloud SDK +from google.cloud import bigquery +client = bigquery.Client() + +query = "SELECT * FROM project.dataset.table WHERE date > '2023-01-01'" +df = pl.from_pandas(client.query(query).to_dataframe()) +``` + +## Apache Arrow + +### IPC/Feather Format + +**Read:** +```python +df = pl.read_ipc("data.arrow") +lf = pl.scan_ipc("data.arrow") +``` + +**Write:** +```python +df.write_ipc("output.arrow") + +# Compressed +df.write_ipc("output.arrow", compression="zstd") +``` + +### Arrow Streaming + +```python +# Write streaming format +df.write_ipc("output.arrows", compression="zstd") + +# Read streaming +df = pl.read_ipc("output.arrows") +``` + +### From/To Arrow + +```python +import pyarrow as pa + +# From Arrow Table +arrow_table = pa.table({"col": [1, 2, 3]}) +df = pl.from_arrow(arrow_table) + +# To Arrow Table +arrow_table = df.to_arrow() +``` + +## In-Memory Formats + +### Python Dictionaries + +```python +# From dict +df = pl.DataFrame({ + "col1": [1, 2, 3], + "col2": ["a", "b", "c"] +}) + +# To dict +data_dict = df.to_dict() # Column-oriented +data_dict = df.to_dict(as_series=False) # Lists instead of Series +``` + +### NumPy Arrays + +```python +import numpy as np + +# From NumPy +arr = np.array([[1, 2], [3, 4], [5, 6]]) +df = pl.DataFrame(arr, schema=["col1", "col2"]) + +# To NumPy +arr = df.to_numpy() +``` + +### Pandas DataFrames + +```python +import pandas as pd + +# From Pandas +pd_df = pd.DataFrame({"col": [1, 2, 3]}) +pl_df = pl.from_pandas(pd_df) + +# To Pandas +pd_df = pl_df.to_pandas() + +# Zero-copy when possible +pl_df = pl.from_arrow(pd_df) +``` + +### Lists of Rows + +```python +# From list of dicts +data = [ + {"name": "Alice", "age": 25}, + {"name": "Bob", "age": 30} +] +df = pl.DataFrame(data) + +# To list of dicts +rows = df.to_dicts() + +# From list of tuples +data = [("Alice", 25), ("Bob", 30)] +df = pl.DataFrame(data, schema=["name", "age"]) +``` + +## Streaming Large Files + +For datasets larger than memory, use lazy mode with streaming: + +```python +# Streaming mode +lf = pl.scan_csv("very_large.csv") +result = lf.filter(pl.col("value") > 100).collect(streaming=True) + +# 
Streaming with multiple files +lf = pl.scan_parquet("data/*.parquet") +result = lf.group_by("category").agg(pl.col("value").sum()).collect(streaming=True) +``` + +## Best Practices + +### Format Selection + +**Use Parquet when:** +- Need compression (up to 10x smaller than CSV) +- Want fast reads/writes +- Need to preserve data types +- Working with large datasets +- Need predicate pushdown + +**Use CSV when:** +- Need human-readable format +- Interfacing with legacy systems +- Data is small +- Need universal compatibility + +**Use JSON when:** +- Working with nested/hierarchical data +- Need web API compatibility +- Data has flexible schema + +**Use Arrow IPC when:** +- Need zero-copy data sharing +- Fastest serialization required +- Working between Arrow-compatible systems + +### Reading Large Files + +```python +# 1. Always use lazy mode +lf = pl.scan_csv("large.csv") # NOT read_csv + +# 2. Filter and select early (pushdown optimization) +result = ( + lf + .select("col1", "col2", "col3") # Only needed columns + .filter(pl.col("date") > "2023-01-01") # Filter early + .collect() +) + +# 3. Use streaming for very large data +result = lf.filter(...).select(...).collect(streaming=True) + +# 4. Read only needed rows during development +df = pl.read_csv("large.csv", n_rows=10000) # Sample for testing +``` + +### Writing Large Files + +```python +# 1. Use Parquet with compression +df.write_parquet("output.parquet", compression="zstd") + +# 2. Use partitioning for very large datasets +df.write_parquet("output", partition_by=["year", "month"]) + +# 3. Write streaming +lf = pl.scan_csv("input.csv") +lf.sink_parquet("output.parquet") # Streaming write +``` + +### Performance Tips + +```python +# 1. Specify dtypes when reading CSV +df = pl.read_csv( + "data.csv", + dtypes={"id": pl.Int64, "name": pl.Utf8} # Avoids inference +) + +# 2. Use appropriate compression +df.write_parquet("output.parquet", compression="snappy") # Fast +df.write_parquet("output.parquet", compression="zstd") # Better compression + +# 3. Parallel reading +df = pl.read_csv("data.csv", parallel="auto") + +# 4. Read multiple files in parallel +lf = pl.scan_parquet("data/*.parquet") # Automatic parallel read +``` + +## Error Handling + +```python +try: + df = pl.read_csv("data.csv") +except pl.exceptions.ComputeError as e: + print(f"Error reading CSV: {e}") + +# Ignore errors during parsing +df = pl.read_csv("messy.csv", ignore_errors=True) + +# Handle missing files +from pathlib import Path +if Path("data.csv").exists(): + df = pl.read_csv("data.csv") +else: + print("File not found") +``` + +## Schema Management + +```python +# Infer schema from sample +schema = pl.read_csv("data.csv", n_rows=1000).schema + +# Use inferred schema for full read +df = pl.read_csv("data.csv", dtypes=schema) + +# Define schema explicitly +schema = { + "id": pl.Int64, + "name": pl.Utf8, + "date": pl.Date, + "value": pl.Float64 +} +df = pl.read_csv("data.csv", dtypes=schema) +``` diff --git a/components/skills/data-analysis-polars-dev/references/operations.md b/components/skills/data-analysis-polars-dev/references/operations.md new file mode 100644 index 0000000..40441f5 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/operations.md @@ -0,0 +1,602 @@ +# Polars Operations Reference + +This reference covers all common Polars operations with comprehensive examples. 
+ +## Selection Operations + +### Select Columns + +**Basic selection:** +```python +# Select specific columns +df.select("name", "age", "city") + +# Using expressions +df.select(pl.col("name"), pl.col("age")) +``` + +**Pattern-based selection:** +```python +# All columns starting with "sales_" +df.select(pl.col("^sales_.*$")) + +# All numeric columns +df.select(pl.col(pl.NUMERIC_DTYPES)) + +# All columns except specific ones +df.select(pl.all().exclude("id", "timestamp")) +``` + +**Computed columns:** +```python +df.select( + "name", + (pl.col("age") * 12).alias("age_in_months"), + (pl.col("salary") * 1.1).alias("salary_after_raise") +) +``` + +### With Columns (Add/Modify) + +Add new columns or modify existing ones while preserving all other columns: + +```python +# Add new columns +df.with_columns( + age_doubled=pl.col("age") * 2, + full_name=pl.col("first_name") + " " + pl.col("last_name") +) + +# Modify existing columns +df.with_columns( + pl.col("name").str.to_uppercase().alias("name"), + pl.col("salary").cast(pl.Float64).alias("salary") +) + +# Multiple operations in parallel +df.with_columns( + pl.col("value") * 10, + pl.col("value") * 100, + pl.col("value") * 1000, +) +``` + +## Filtering Operations + +### Basic Filtering + +```python +# Single condition +df.filter(pl.col("age") > 25) + +# Multiple conditions (AND) +df.filter( + pl.col("age") > 25, + pl.col("city") == "NY" +) + +# OR conditions +df.filter( + (pl.col("age") > 30) | (pl.col("salary") > 100000) +) + +# NOT condition +df.filter(~pl.col("active")) +df.filter(pl.col("city") != "NY") +``` + +### Advanced Filtering + +**String operations:** +```python +# Contains substring +df.filter(pl.col("name").str.contains("John")) + +# Starts with +df.filter(pl.col("email").str.starts_with("admin")) + +# Regex match +df.filter(pl.col("phone").str.contains(r"^\d{3}-\d{3}-\d{4}$")) +``` + +**Membership checks:** +```python +# In list +df.filter(pl.col("city").is_in(["NY", "LA", "SF"])) + +# Not in list +df.filter(~pl.col("status").is_in(["inactive", "deleted"])) +``` + +**Range filters:** +```python +# Between values +df.filter(pl.col("age").is_between(25, 35)) + +# Date range +df.filter( + pl.col("date") >= pl.date(2023, 1, 1), + pl.col("date") <= pl.date(2023, 12, 31) +) +``` + +**Null filtering:** +```python +# Filter out nulls +df.filter(pl.col("value").is_not_null()) + +# Keep only nulls +df.filter(pl.col("value").is_null()) +``` + +## Grouping and Aggregation + +### Basic Group By + +```python +# Group by single column +df.group_by("department").agg( + pl.col("salary").mean().alias("avg_salary"), + pl.len().alias("employee_count") +) + +# Group by multiple columns +df.group_by("department", "location").agg( + pl.col("salary").sum() +) + +# Maintain order +df.group_by("category", maintain_order=True).agg( + pl.col("value").sum() +) +``` + +### Aggregation Functions + +**Count and length:** +```python +df.group_by("category").agg( + pl.len().alias("count"), + pl.col("id").count().alias("non_null_count"), + pl.col("id").n_unique().alias("unique_count") +) +``` + +**Statistical aggregations:** +```python +df.group_by("group").agg( + pl.col("value").sum().alias("total"), + pl.col("value").mean().alias("average"), + pl.col("value").median().alias("median"), + pl.col("value").std().alias("std_dev"), + pl.col("value").var().alias("variance"), + pl.col("value").min().alias("minimum"), + pl.col("value").max().alias("maximum"), + pl.col("value").quantile(0.95).alias("p95") +) +``` + +**First and last:** +```python 
+df.group_by("user_id").agg( + pl.col("timestamp").first().alias("first_seen"), + pl.col("timestamp").last().alias("last_seen"), + pl.col("event").first().alias("first_event") +) +``` + +**List aggregation:** +```python +# Collect values into lists +df.group_by("category").agg( + pl.col("item").alias("all_items") # Creates list column +) +``` + +### Conditional Aggregations + +Filter within aggregations: + +```python +df.group_by("department").agg( + # Count high earners + (pl.col("salary") > 100000).sum().alias("high_earners"), + + # Average of filtered values + pl.col("salary").filter(pl.col("bonus") > 0).mean().alias("avg_with_bonus"), + + # Conditional sum + pl.when(pl.col("active")) + .then(pl.col("sales")) + .otherwise(0) + .sum() + .alias("active_sales") +) +``` + +### Multiple Aggregations + +Combine multiple aggregations efficiently: + +```python +df.group_by("store_id").agg( + pl.col("transaction_id").count().alias("num_transactions"), + pl.col("amount").sum().alias("total_sales"), + pl.col("amount").mean().alias("avg_transaction"), + pl.col("customer_id").n_unique().alias("unique_customers"), + pl.col("amount").max().alias("largest_transaction"), + pl.col("timestamp").min().alias("first_transaction_date"), + pl.col("timestamp").max().alias("last_transaction_date") +) +``` + +## Window Functions + +Window functions apply aggregations while preserving the original row count. + +### Basic Window Operations + +**Group statistics:** +```python +# Add group mean to each row +df.with_columns( + avg_age_by_dept=pl.col("age").mean().over("department") +) + +# Multiple group columns +df.with_columns( + group_avg=pl.col("value").mean().over("category", "region") +) +``` + +**Ranking:** +```python +df.with_columns( + # Rank within groups + rank=pl.col("score").rank().over("team"), + + # Dense rank (no gaps) + dense_rank=pl.col("score").rank(method="dense").over("team"), + + # Row number + row_num=pl.col("timestamp").sort().rank(method="ordinal").over("user_id") +) +``` + +### Window Mapping Strategies + +**group_to_rows (default):** +Preserves original row order: +```python +df.with_columns( + group_mean=pl.col("value").mean().over("category", mapping_strategy="group_to_rows") +) +``` + +**explode:** +Faster, groups rows together: +```python +df.with_columns( + group_mean=pl.col("value").mean().over("category", mapping_strategy="explode") +) +``` + +**join:** +Creates list columns: +```python +df.with_columns( + group_values=pl.col("value").over("category", mapping_strategy="join") +) +``` + +### Rolling Windows + +**Time-based rolling:** +```python +df.with_columns( + rolling_avg=pl.col("value").rolling_mean( + window_size="7d", + by="date" + ) +) +``` + +**Row-based rolling:** +```python +df.with_columns( + rolling_sum=pl.col("value").rolling_sum(window_size=3), + rolling_max=pl.col("value").rolling_max(window_size=5) +) +``` + +### Cumulative Operations + +```python +df.with_columns( + cumsum=pl.col("value").cum_sum().over("group"), + cummax=pl.col("value").cum_max().over("group"), + cummin=pl.col("value").cum_min().over("group"), + cumprod=pl.col("value").cum_prod().over("group") +) +``` + +### Shift and Lag/Lead + +```python +df.with_columns( + # Previous value (lag) + prev_value=pl.col("value").shift(1).over("user_id"), + + # Next value (lead) + next_value=pl.col("value").shift(-1).over("user_id"), + + # Calculate difference from previous + diff=pl.col("value") - pl.col("value").shift(1).over("user_id") +) +``` + +## Sorting + +### Basic Sorting + +```python +# Sort by single column 
+df.sort("age") + +# Sort descending +df.sort("age", descending=True) + +# Sort by multiple columns +df.sort("department", "age") + +# Mixed sorting order +df.sort(["department", "salary"], descending=[False, True]) +``` + +### Advanced Sorting + +**Null handling:** +```python +# Nulls first +df.sort("value", nulls_last=False) + +# Nulls last +df.sort("value", nulls_last=True) +``` + +**Sort by expression:** +```python +# Sort by computed value +df.sort(pl.col("first_name").str.len()) + +# Sort by multiple expressions +df.sort( + pl.col("last_name").str.to_lowercase(), + pl.col("age").abs() +) +``` + +## Conditional Operations + +### When/Then/Otherwise + +```python +# Basic conditional +df.with_columns( + status=pl.when(pl.col("age") >= 18) + .then("adult") + .otherwise("minor") +) + +# Multiple conditions +df.with_columns( + category=pl.when(pl.col("score") >= 90) + .then("A") + .when(pl.col("score") >= 80) + .then("B") + .when(pl.col("score") >= 70) + .then("C") + .otherwise("F") +) + +# Conditional computation +df.with_columns( + adjusted_price=pl.when(pl.col("is_member")) + .then(pl.col("price") * 0.9) + .otherwise(pl.col("price")) +) +``` + +## String Operations + +### Common String Methods + +```python +df.with_columns( + # Case conversion + upper=pl.col("name").str.to_uppercase(), + lower=pl.col("name").str.to_lowercase(), + title=pl.col("name").str.to_titlecase(), + + # Trimming + trimmed=pl.col("text").str.strip_chars(), + + # Substring + first_3=pl.col("name").str.slice(0, 3), + + # Replace + cleaned=pl.col("text").str.replace("old", "new"), + cleaned_all=pl.col("text").str.replace_all("old", "new"), + + # Split + parts=pl.col("full_name").str.split(" "), + + # Length + name_length=pl.col("name").str.len_chars() +) +``` + +### String Filtering + +```python +# Contains +df.filter(pl.col("email").str.contains("@gmail.com")) + +# Starts/ends with +df.filter(pl.col("name").str.starts_with("A")) +df.filter(pl.col("file").str.ends_with(".csv")) + +# Regex matching +df.filter(pl.col("phone").str.contains(r"^\d{3}-\d{4}$")) +``` + +## Date and Time Operations + +### Date Parsing + +```python +# Parse strings to dates +df.with_columns( + date=pl.col("date_str").str.strptime(pl.Date, "%Y-%m-%d"), + datetime=pl.col("dt_str").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S") +) +``` + +### Date Components + +```python +df.with_columns( + year=pl.col("date").dt.year(), + month=pl.col("date").dt.month(), + day=pl.col("date").dt.day(), + weekday=pl.col("date").dt.weekday(), + hour=pl.col("datetime").dt.hour(), + minute=pl.col("datetime").dt.minute() +) +``` + +### Date Arithmetic + +```python +# Add duration +df.with_columns( + next_week=pl.col("date") + pl.duration(weeks=1), + next_month=pl.col("date") + pl.duration(months=1) +) + +# Difference between dates +df.with_columns( + days_diff=(pl.col("end_date") - pl.col("start_date")).dt.total_days() +) +``` + +### Date Filtering + +```python +# Filter by date range +df.filter( + pl.col("date").is_between(pl.date(2023, 1, 1), pl.date(2023, 12, 31)) +) + +# Filter by year +df.filter(pl.col("date").dt.year() == 2023) + +# Filter by month +df.filter(pl.col("date").dt.month().is_in([6, 7, 8])) # Summer months +``` + +## List Operations + +### Working with List Columns + +```python +# Create list column +df.with_columns( + items_list=pl.col("item1", "item2", "item3").to_list() +) + +# List operations +df.with_columns( + list_len=pl.col("items").list.len(), + first_item=pl.col("items").list.first(), + last_item=pl.col("items").list.last(), + 
unique_items=pl.col("items").list.unique(), + sorted_items=pl.col("items").list.sort() +) + +# Explode lists to rows +df.explode("items") + +# Filter list elements +df.with_columns( + filtered=pl.col("items").list.eval(pl.element() > 10) +) +``` + +## Struct Operations + +### Working with Nested Structures + +```python +# Create struct column +df.with_columns( + address=pl.struct(["street", "city", "zip"]) +) + +# Access struct fields +df.with_columns( + city=pl.col("address").struct.field("city") +) + +# Unnest struct to columns +df.unnest("address") +``` + +## Unique and Duplicate Operations + +```python +# Get unique rows +df.unique() + +# Unique on specific columns +df.unique(subset=["name", "email"]) + +# Keep first/last duplicate +df.unique(subset=["id"], keep="first") +df.unique(subset=["id"], keep="last") + +# Identify duplicates +df.with_columns( + is_duplicate=pl.col("id").is_duplicated() +) + +# Count duplicates +df.group_by("email").agg( + pl.len().alias("count") +).filter(pl.col("count") > 1) +``` + +## Sampling + +```python +# Random sample +df.sample(n=100) + +# Sample fraction +df.sample(fraction=0.1) + +# Sample with seed for reproducibility +df.sample(n=100, seed=42) +``` + +## Column Renaming + +```python +# Rename specific columns +df.rename({"old_name": "new_name", "age": "years"}) + +# Rename with expression +df.select(pl.col("*").name.suffix("_renamed")) +df.select(pl.col("*").name.prefix("data_")) +df.select(pl.col("*").name.to_uppercase()) +``` diff --git a/components/skills/data-analysis-polars-dev/references/pandas_migration.md b/components/skills/data-analysis-polars-dev/references/pandas_migration.md new file mode 100644 index 0000000..aa5fd24 --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/pandas_migration.md @@ -0,0 +1,417 @@ +# Pandas to Polars Migration Guide + +This guide helps you migrate from pandas to Polars with comprehensive operation mappings and key differences. + +## Core Conceptual Differences + +### 1. No Index System + +**Pandas:** Uses row-based indexing system +```python +df.loc[0, "column"] +df.iloc[0:5] +df.set_index("id") +``` + +**Polars:** Uses integer positions only +```python +df[0, "column"] # Row position, column name +df[0:5] # Row slice +# No set_index equivalent - use group_by instead +``` + +### 2. Memory Format + +**Pandas:** Row-oriented NumPy arrays +**Polars:** Columnar Apache Arrow format + +**Implications:** +- Polars is faster for column operations +- Polars uses less memory +- Polars has better data sharing capabilities + +### 3. Parallelization + +**Pandas:** Primarily single-threaded (requires Dask for parallelism) +**Polars:** Parallel by default using Rust's concurrency + +### 4. Lazy Evaluation + +**Pandas:** Only eager evaluation +**Polars:** Both eager (DataFrame) and lazy (LazyFrame) with query optimization + +### 5. 
Type Strictness + +**Pandas:** Allows silent type conversions +**Polars:** Strict typing, explicit casts required + +**Example:** +```python +# Pandas: Silently converts to float +pd_df["int_col"] = [1, 2, None, 4] # dtype: float64 + +# Polars: Keeps as integer with null +pl_df = pl.DataFrame({"int_col": [1, 2, None, 4]}) # dtype: Int64 +``` + +## Operation Mappings + +### Data Selection + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Select column | `df["col"]` or `df.col` | `df.select("col")` or `df["col"]` | +| Select multiple | `df[["a", "b"]]` | `df.select("a", "b")` | +| Select by position | `df.iloc[:, 0:3]` | `df.select(pl.col(df.columns[0:3]))` | +| Select by condition | `df[df["age"] > 25]` | `df.filter(pl.col("age") > 25)` | + +### Data Filtering + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Single condition | `df[df["age"] > 25]` | `df.filter(pl.col("age") > 25)` | +| Multiple conditions | `df[(df["age"] > 25) & (df["city"] == "NY")]` | `df.filter(pl.col("age") > 25, pl.col("city") == "NY")` | +| Query method | `df.query("age > 25")` | `df.filter(pl.col("age") > 25)` | +| isin | `df[df["city"].isin(["NY", "LA"])]` | `df.filter(pl.col("city").is_in(["NY", "LA"]))` | +| isna | `df[df["value"].isna()]` | `df.filter(pl.col("value").is_null())` | +| notna | `df[df["value"].notna()]` | `df.filter(pl.col("value").is_not_null())` | + +### Adding/Modifying Columns + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Add column | `df["new"] = df["old"] * 2` | `df.with_columns(new=pl.col("old") * 2)` | +| Multiple columns | `df.assign(a=..., b=...)` | `df.with_columns(a=..., b=...)` | +| Conditional column | `np.where(condition, a, b)` | `pl.when(condition).then(a).otherwise(b)` | + +**Important difference - Parallel execution:** + +```python +# Pandas: Sequential (lambda sees previous results) +df.assign( + a=lambda df_: df_.value * 10, + b=lambda df_: df_.value * 100 +) + +# Polars: Parallel (all computed together) +df.with_columns( + a=pl.col("value") * 10, + b=pl.col("value") * 100 +) +``` + +### Grouping and Aggregation + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Group by | `df.groupby("col")` | `df.group_by("col")` | +| Agg single | `df.groupby("col")["val"].mean()` | `df.group_by("col").agg(pl.col("val").mean())` | +| Agg multiple | `df.groupby("col").agg({"val": ["mean", "sum"]})` | `df.group_by("col").agg(pl.col("val").mean(), pl.col("val").sum())` | +| Size | `df.groupby("col").size()` | `df.group_by("col").agg(pl.len())` | +| Count | `df.groupby("col").count()` | `df.group_by("col").agg(pl.col("*").count())` | + +### Window Functions + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Transform | `df.groupby("col").transform("mean")` | `df.with_columns(pl.col("val").mean().over("col"))` | +| Rank | `df.groupby("col")["val"].rank()` | `df.with_columns(pl.col("val").rank().over("col"))` | +| Shift | `df.groupby("col")["val"].shift(1)` | `df.with_columns(pl.col("val").shift(1).over("col"))` | +| Cumsum | `df.groupby("col")["val"].cumsum()` | `df.with_columns(pl.col("val").cum_sum().over("col"))` | + +### Joins + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Inner join | `df1.merge(df2, on="id")` | `df1.join(df2, on="id", how="inner")` | +| Left join | `df1.merge(df2, on="id", how="left")` | `df1.join(df2, on="id", how="left")` | +| Different keys | `df1.merge(df2, left_on="a", right_on="b")` | `df1.join(df2, left_on="a", right_on="b")` | + +### 
Concatenation + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Vertical | `pd.concat([df1, df2], axis=0)` | `pl.concat([df1, df2], how="vertical")` | +| Horizontal | `pd.concat([df1, df2], axis=1)` | `pl.concat([df1, df2], how="horizontal")` | + +### Sorting + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Sort by column | `df.sort_values("col")` | `df.sort("col")` | +| Descending | `df.sort_values("col", ascending=False)` | `df.sort("col", descending=True)` | +| Multiple columns | `df.sort_values(["a", "b"])` | `df.sort("a", "b")` | + +### Reshaping + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Pivot | `df.pivot(index="a", columns="b", values="c")` | `df.pivot(values="c", index="a", columns="b")` | +| Melt | `df.melt(id_vars="id")` | `df.unpivot(index="id")` | + +### I/O Operations + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Read CSV | `pd.read_csv("file.csv")` | `pl.read_csv("file.csv")` or `pl.scan_csv()` | +| Write CSV | `df.to_csv("file.csv")` | `df.write_csv("file.csv")` | +| Read Parquet | `pd.read_parquet("file.parquet")` | `pl.read_parquet("file.parquet")` | +| Write Parquet | `df.to_parquet("file.parquet")` | `df.write_parquet("file.parquet")` | +| Read Excel | `pd.read_excel("file.xlsx")` | `pl.read_excel("file.xlsx")` | + +### String Operations + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Upper | `df["col"].str.upper()` | `df.select(pl.col("col").str.to_uppercase())` | +| Lower | `df["col"].str.lower()` | `df.select(pl.col("col").str.to_lowercase())` | +| Contains | `df["col"].str.contains("pattern")` | `df.filter(pl.col("col").str.contains("pattern"))` | +| Replace | `df["col"].str.replace("old", "new")` | `df.select(pl.col("col").str.replace("old", "new"))` | +| Split | `df["col"].str.split(" ")` | `df.select(pl.col("col").str.split(" "))` | + +### Datetime Operations + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Parse dates | `pd.to_datetime(df["col"])` | `df.select(pl.col("col").str.strptime(pl.Date, "%Y-%m-%d"))` | +| Year | `df["date"].dt.year` | `df.select(pl.col("date").dt.year())` | +| Month | `df["date"].dt.month` | `df.select(pl.col("date").dt.month())` | +| Day | `df["date"].dt.day` | `df.select(pl.col("date").dt.day())` | + +### Missing Data + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Drop nulls | `df.dropna()` | `df.drop_nulls()` | +| Fill nulls | `df.fillna(0)` | `df.fill_null(0)` | +| Check null | `df["col"].isna()` | `df.select(pl.col("col").is_null())` | +| Forward fill | `df.fillna(method="ffill")` | `df.select(pl.col("col").fill_null(strategy="forward"))` | + +### Other Operations + +| Operation | Pandas | Polars | +|-----------|--------|--------| +| Unique values | `df["col"].unique()` | `df["col"].unique()` | +| Value counts | `df["col"].value_counts()` | `df["col"].value_counts()` | +| Describe | `df.describe()` | `df.describe()` | +| Sample | `df.sample(n=100)` | `df.sample(n=100)` | +| Head | `df.head()` | `df.head()` | +| Tail | `df.tail()` | `df.tail()` | + +## Common Migration Patterns + +### Pattern 1: Chained Operations + +**Pandas:** +```python +result = (df + .assign(new_col=lambda x: x["old_col"] * 2) + .query("new_col > 10") + .groupby("category") + .agg({"value": "sum"}) + .reset_index() +) +``` + +**Polars:** +```python +result = (df + .with_columns(new_col=pl.col("old_col") * 2) + .filter(pl.col("new_col") > 10) + .group_by("category") + .agg(pl.col("value").sum()) +) +# 
No reset_index needed - Polars doesn't have index +``` + +### Pattern 2: Apply Functions + +**Pandas:** +```python +# Avoid in Polars - breaks parallelization +df["result"] = df["value"].apply(lambda x: x * 2) +``` + +**Polars:** +```python +# Use expressions instead +df = df.with_columns(result=pl.col("value") * 2) + +# If custom function needed +df = df.with_columns( + result=pl.col("value").map_elements(lambda x: x * 2, return_dtype=pl.Float64) +) +``` + +### Pattern 3: Conditional Column Creation + +**Pandas:** +```python +df["category"] = np.where( + df["value"] > 100, + "high", + np.where(df["value"] > 50, "medium", "low") +) +``` + +**Polars:** +```python +df = df.with_columns( + category=pl.when(pl.col("value") > 100) + .then("high") + .when(pl.col("value") > 50) + .then("medium") + .otherwise("low") +) +``` + +### Pattern 4: Group Transform + +**Pandas:** +```python +df["group_mean"] = df.groupby("category")["value"].transform("mean") +``` + +**Polars:** +```python +df = df.with_columns( + group_mean=pl.col("value").mean().over("category") +) +``` + +### Pattern 5: Multiple Aggregations + +**Pandas:** +```python +result = df.groupby("category").agg({ + "value": ["mean", "sum", "count"], + "price": ["min", "max"] +}) +``` + +**Polars:** +```python +result = df.group_by("category").agg( + pl.col("value").mean().alias("value_mean"), + pl.col("value").sum().alias("value_sum"), + pl.col("value").count().alias("value_count"), + pl.col("price").min().alias("price_min"), + pl.col("price").max().alias("price_max") +) +``` + +## Performance Anti-Patterns to Avoid + +### Anti-Pattern 1: Sequential Pipe Operations + +**Bad (disables parallelization):** +```python +df = df.pipe(function1).pipe(function2).pipe(function3) +``` + +**Good (enables parallelization):** +```python +df = df.with_columns( + function1_result(), + function2_result(), + function3_result() +) +``` + +### Anti-Pattern 2: Python Functions in Hot Paths + +**Bad:** +```python +df = df.with_columns( + result=pl.col("value").map_elements(lambda x: x * 2) +) +``` + +**Good:** +```python +df = df.with_columns(result=pl.col("value") * 2) +``` + +### Anti-Pattern 3: Using Eager Reading for Large Files + +**Bad:** +```python +df = pl.read_csv("large_file.csv") +result = df.filter(pl.col("age") > 25).select("name", "age") +``` + +**Good:** +```python +lf = pl.scan_csv("large_file.csv") +result = lf.filter(pl.col("age") > 25).select("name", "age").collect() +``` + +### Anti-Pattern 4: Row Iteration + +**Bad:** +```python +for row in df.iter_rows(): + # Process row + pass +``` + +**Good:** +```python +# Use vectorized operations +df = df.with_columns( + # Vectorized computation +) +``` + +## Migration Checklist + +When migrating from pandas to Polars: + +1. **Remove index operations** - Use integer positions or group_by +2. **Replace apply/map with expressions** - Use Polars native operations +3. **Update column assignment** - Use `with_columns()` instead of direct assignment +4. **Change groupby.transform to .over()** - Window functions work differently +5. **Update string operations** - Use `.str.to_uppercase()` instead of `.str.upper()` +6. **Add explicit type casts** - Polars won't silently convert types +7. **Consider lazy evaluation** - Use `scan_*` instead of `read_*` for large data +8. **Update aggregation syntax** - More explicit in Polars +9. **Remove reset_index calls** - Not needed in Polars +10. 
**Update conditional logic** - Use `when().then().otherwise()` pattern + +## Compatibility Layer + +For gradual migration, you can use both libraries: + +```python +import pandas as pd +import polars as pl + +# Convert pandas to Polars +pl_df = pl.from_pandas(pd_df) + +# Convert Polars to pandas +pd_df = pl_df.to_pandas() + +# Use Arrow for zero-copy (when possible) +pl_df = pl.from_arrow(pd_df) +pd_df = pl_df.to_arrow().to_pandas() +``` + +## When to Stick with Pandas + +Consider staying with pandas when: +- Working with time series requiring complex index operations +- Need extensive ecosystem support (some libraries only support pandas) +- Team lacks Rust/Polars expertise +- Data is small and performance isn't critical +- Using advanced pandas features without Polars equivalents + +## When to Switch to Polars + +Switch to Polars when: +- Performance is critical +- Working with large datasets (>1GB) +- Need lazy evaluation and query optimization +- Want better type safety +- Need parallel execution by default +- Starting a new project diff --git a/components/skills/data-analysis-polars-dev/references/transformations.md b/components/skills/data-analysis-polars-dev/references/transformations.md new file mode 100644 index 0000000..af57f1c --- /dev/null +++ b/components/skills/data-analysis-polars-dev/references/transformations.md @@ -0,0 +1,549 @@ +# Polars Data Transformations + +Comprehensive guide to joins, concatenation, and reshaping operations in Polars. + +## Joins + +Joins combine data from multiple DataFrames based on common columns. + +### Basic Join Types + +**Inner Join (intersection):** +```python +# Keep only matching rows from both DataFrames +result = df1.join(df2, on="id", how="inner") +``` + +**Left Join (all left + matches from right):** +```python +# Keep all rows from left, add matching rows from right +result = df1.join(df2, on="id", how="left") +``` + +**Outer Join (union):** +```python +# Keep all rows from both DataFrames +result = df1.join(df2, on="id", how="outer") +``` + +**Cross Join (Cartesian product):** +```python +# Every row from left with every row from right +result = df1.join(df2, how="cross") +``` + +**Semi Join (filtered left):** +```python +# Keep only left rows that have a match in right +result = df1.join(df2, on="id", how="semi") +``` + +**Anti Join (non-matching left):** +```python +# Keep only left rows that DON'T have a match in right +result = df1.join(df2, on="id", how="anti") +``` + +### Join Syntax Variations + +**Single column join:** +```python +df1.join(df2, on="id") +``` + +**Multiple columns join:** +```python +df1.join(df2, on=["id", "date"]) +``` + +**Different column names:** +```python +df1.join(df2, left_on="user_id", right_on="id") +``` + +**Multiple different columns:** +```python +df1.join( + df2, + left_on=["user_id", "date"], + right_on=["id", "timestamp"] +) +``` + +### Suffix Handling + +When both DataFrames have columns with the same name (other than join keys): + +```python +# Add suffixes to distinguish columns +result = df1.join(df2, on="id", suffix="_right") + +# Results in: value, value_right (if both had "value" column) +``` + +### Join Examples + +**Example 1: Customer Orders** +```python +customers = pl.DataFrame({ + "customer_id": [1, 2, 3, 4], + "name": ["Alice", "Bob", "Charlie", "David"] +}) + +orders = pl.DataFrame({ + "order_id": [101, 102, 103], + "customer_id": [1, 2, 1], + "amount": [100, 200, 150] +}) + +# Inner join - only customers with orders +result = customers.join(orders, on="customer_id", 
how="inner") + +# Left join - all customers, even without orders +result = customers.join(orders, on="customer_id", how="left") +``` + +**Example 2: Time-series data** +```python +prices = pl.DataFrame({ + "date": ["2023-01-01", "2023-01-02", "2023-01-03"], + "stock": ["AAPL", "AAPL", "AAPL"], + "price": [150, 152, 151] +}) + +volumes = pl.DataFrame({ + "date": ["2023-01-01", "2023-01-02"], + "stock": ["AAPL", "AAPL"], + "volume": [1000000, 1100000] +}) + +result = prices.join( + volumes, + on=["date", "stock"], + how="left" +) +``` + +### Asof Joins (Nearest Match) + +For time-series data, join to nearest timestamp: + +```python +# Join to nearest earlier timestamp +quotes = pl.DataFrame({ + "timestamp": [1, 2, 3, 4, 5], + "stock": ["A", "A", "A", "A", "A"], + "quote": [100, 101, 102, 103, 104] +}) + +trades = pl.DataFrame({ + "timestamp": [1.5, 3.5, 4.2], + "stock": ["A", "A", "A"], + "trade": [50, 75, 100] +}) + +result = trades.join_asof( + quotes, + on="timestamp", + by="stock", + strategy="backward" # or "forward", "nearest" +) +``` + +## Concatenation + +Concatenation stacks DataFrames together. + +### Vertical Concatenation (Stack Rows) + +```python +df1 = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) +df2 = pl.DataFrame({"a": [5, 6], "b": [7, 8]}) + +# Stack rows +result = pl.concat([df1, df2], how="vertical") +# Result: 4 rows, same columns +``` + +**Handling mismatched schemas:** +```python +df1 = pl.DataFrame({"a": [1, 2], "b": [3, 4]}) +df2 = pl.DataFrame({"a": [5, 6], "c": [7, 8]}) + +# Diagonal concat - fills missing columns with nulls +result = pl.concat([df1, df2], how="diagonal") +# Result: columns a, b, c (with nulls where not present) +``` + +### Horizontal Concatenation (Stack Columns) + +```python +df1 = pl.DataFrame({"a": [1, 2, 3]}) +df2 = pl.DataFrame({"b": [4, 5, 6]}) + +# Stack columns +result = pl.concat([df1, df2], how="horizontal") +# Result: 3 rows, columns a and b +``` + +**Note:** Horizontal concat requires same number of rows. + +### Concatenation Options + +```python +# Rechunk after concatenation (better performance for subsequent operations) +result = pl.concat([df1, df2], rechunk=True) + +# Parallel execution +result = pl.concat([df1, df2], parallel=True) +``` + +### Use Cases + +**Combining data from multiple sources:** +```python +# Read multiple files and concatenate +files = ["data_2023.csv", "data_2024.csv", "data_2025.csv"] +dfs = [pl.read_csv(f) for f in files] +combined = pl.concat(dfs, how="vertical") +``` + +**Adding computed columns:** +```python +base = pl.DataFrame({"value": [1, 2, 3]}) +computed = pl.DataFrame({"doubled": [2, 4, 6]}) +result = pl.concat([base, computed], how="horizontal") +``` + +## Pivoting (Wide Format) + +Convert unique values from one column into multiple columns. + +### Basic Pivot + +```python +df = pl.DataFrame({ + "date": ["2023-01", "2023-01", "2023-02", "2023-02"], + "product": ["A", "B", "A", "B"], + "sales": [100, 150, 120, 160] +}) + +# Pivot: products become columns +pivoted = df.pivot( + values="sales", + index="date", + columns="product" +) +# Result: +# date | A | B +# 2023-01 | 100 | 150 +# 2023-02 | 120 | 160 +``` + +### Pivot with Aggregation + +When there are duplicate combinations, aggregate: + +```python +df = pl.DataFrame({ + "date": ["2023-01", "2023-01", "2023-01"], + "product": ["A", "A", "B"], + "sales": [100, 110, 150] +}) + +# Aggregate duplicates +pivoted = df.pivot( + values="sales", + index="date", + columns="product", + aggregate_function="sum" # or "mean", "max", "min", etc. 
+) +``` + +### Multiple Index Columns + +```python +df = pl.DataFrame({ + "region": ["North", "North", "South", "South"], + "date": ["2023-01", "2023-01", "2023-01", "2023-01"], + "product": ["A", "B", "A", "B"], + "sales": [100, 150, 120, 160] +}) + +pivoted = df.pivot( + values="sales", + index=["region", "date"], + columns="product" +) +``` + +## Unpivoting/Melting (Long Format) + +Convert multiple columns into rows (opposite of pivot). + +### Basic Unpivot + +```python +df = pl.DataFrame({ + "date": ["2023-01", "2023-02"], + "product_A": [100, 120], + "product_B": [150, 160] +}) + +# Unpivot: convert columns to rows +unpivoted = df.unpivot( + index="date", + on=["product_A", "product_B"] +) +# Result: +# date | variable | value +# 2023-01 | product_A | 100 +# 2023-01 | product_B | 150 +# 2023-02 | product_A | 120 +# 2023-02 | product_B | 160 +``` + +### Custom Column Names + +```python +unpivoted = df.unpivot( + index="date", + on=["product_A", "product_B"], + variable_name="product", + value_name="sales" +) +``` + +### Unpivot by Pattern + +```python +# Unpivot all columns matching pattern +df = pl.DataFrame({ + "id": [1, 2], + "sales_Q1": [100, 200], + "sales_Q2": [150, 250], + "sales_Q3": [120, 220], + "revenue_Q1": [1000, 2000] +}) + +# Unpivot all sales columns +unpivoted = df.unpivot( + index="id", + on=pl.col("^sales_.*$") +) +``` + +## Exploding (Unnesting Lists) + +Convert list columns into multiple rows. + +### Basic Explode + +```python +df = pl.DataFrame({ + "id": [1, 2], + "values": [[1, 2, 3], [4, 5]] +}) + +# Explode list into rows +exploded = df.explode("values") +# Result: +# id | values +# 1 | 1 +# 1 | 2 +# 1 | 3 +# 2 | 4 +# 2 | 5 +``` + +### Multiple Column Explode + +```python +df = pl.DataFrame({ + "id": [1, 2], + "letters": [["a", "b"], ["c", "d"]], + "numbers": [[1, 2], [3, 4]] +}) + +# Explode multiple columns (must be same length) +exploded = df.explode("letters", "numbers") +``` + +## Transposing + +Swap rows and columns: + +```python +df = pl.DataFrame({ + "metric": ["sales", "costs", "profit"], + "Q1": [100, 60, 40], + "Q2": [150, 80, 70] +}) + +# Transpose +transposed = df.transpose( + include_header=True, + header_name="quarter", + column_names="metric" +) +# Result: quarters as rows, metrics as columns +``` + +## Reshaping Patterns + +### Pattern 1: Wide to Long to Wide + +```python +# Start wide +wide = pl.DataFrame({ + "id": [1, 2], + "A": [10, 20], + "B": [30, 40] +}) + +# To long +long = wide.unpivot(index="id", on=["A", "B"]) + +# Back to wide (maybe with transformations) +wide_again = long.pivot(values="value", index="id", columns="variable") +``` + +### Pattern 2: Nested to Flat + +```python +# Nested data +df = pl.DataFrame({ + "user": [1, 2], + "purchases": [ + [{"item": "A", "qty": 2}, {"item": "B", "qty": 1}], + [{"item": "C", "qty": 3}] + ] +}) + +# Explode and unnest +flat = ( + df.explode("purchases") + .unnest("purchases") +) +``` + +### Pattern 3: Aggregation to Pivot + +```python +# Raw data +sales = pl.DataFrame({ + "date": ["2023-01", "2023-01", "2023-02"], + "product": ["A", "B", "A"], + "sales": [100, 150, 120] +}) + +# Aggregate then pivot +result = ( + sales + .group_by("date", "product") + .agg(pl.col("sales").sum()) + .pivot(values="sales", index="date", columns="product") +) +``` + +## Advanced Transformations + +### Conditional Reshaping + +```python +# Pivot only certain values +df.filter(pl.col("year") >= 2020).pivot(...) 
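+# e.g., with hypothetical column names, the elided call could be:
+#   df.filter(pl.col("year") >= 2020).pivot(values="sales", index="region", columns="product")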
+ +# Unpivot with filtering +df.unpivot(index="id", on=pl.col("^sales.*$")) +``` + +### Multi-level Transformations + +```python +# Complex reshaping pipeline +result = ( + df + .unpivot(index="id", on=pl.col("^Q[0-9]_.*$")) + .with_columns( + quarter=pl.col("variable").str.extract(r"Q([0-9])", 1), + metric=pl.col("variable").str.extract(r"Q[0-9]_(.*)", 1) + ) + .drop("variable") + .pivot(values="value", index=["id", "quarter"], columns="metric") +) +``` + +## Performance Considerations + +### Join Performance + +```python +# 1. Join on indexed/sorted columns when possible +df1_sorted = df1.sort("id") +df2_sorted = df2.sort("id") +result = df1_sorted.join(df2_sorted, on="id") + +# 2. Use appropriate join type +# semi/anti are faster than inner+filter +matches = df1.join(df2, on="id", how="semi") # Better than filtering after inner join + +# 3. Filter before joining +df1_filtered = df1.filter(pl.col("active")) +result = df1_filtered.join(df2, on="id") # Smaller join +``` + +### Concatenation Performance + +```python +# 1. Rechunk after concatenation +result = pl.concat(dfs, rechunk=True) + +# 2. Use lazy mode for large concatenations +lf1 = pl.scan_parquet("file1.parquet") +lf2 = pl.scan_parquet("file2.parquet") +result = pl.concat([lf1, lf2]).collect() +``` + +### Pivot Performance + +```python +# 1. Filter before pivoting +pivoted = df.filter(pl.col("year") == 2023).pivot(...) + +# 2. Specify aggregate function explicitly +pivoted = df.pivot(..., aggregate_function="first") # Faster than "sum" if only one value +``` + +## Common Use Cases + +### Time Series Alignment + +```python +# Align two time series with different timestamps +ts1.join_asof(ts2, on="timestamp", strategy="backward") +``` + +### Feature Engineering + +```python +# Create lag features +df.with_columns( + pl.col("value").shift(1).over("user_id").alias("prev_value"), + pl.col("value").shift(2).over("user_id").alias("prev_prev_value") +) +``` + +### Data Denormalization + +```python +# Combine normalized tables +orders.join(customers, on="customer_id").join(products, on="product_id") +``` + +### Report Generation + +```python +# Pivot for reporting +sales.pivot(values="amount", index="month", columns="product") +``` diff --git a/components/skills/frontend-testing-playwright-dev/API_REFERENCE.md b/components/skills/frontend-testing-playwright-dev/API_REFERENCE.md new file mode 100644 index 0000000..9ee2975 --- /dev/null +++ b/components/skills/frontend-testing-playwright-dev/API_REFERENCE.md @@ -0,0 +1,653 @@ +# Playwright Skill - Complete API Reference + +This document contains the comprehensive Playwright API reference and advanced patterns. For quick-start execution patterns, see [SKILL.md](SKILL.md). 
+ +## Table of Contents + +- [Installation & Setup](#installation--setup) +- [Core Patterns](#core-patterns) +- [Selectors & Locators](#selectors--locators) +- [Common Actions](#common-actions) +- [Waiting Strategies](#waiting-strategies) +- [Assertions](#assertions) +- [Page Object Model](#page-object-model-pom) +- [Network & API Testing](#network--api-testing) +- [Authentication & Session Management](#authentication--session-management) +- [Visual Testing](#visual-testing) +- [Mobile Testing](#mobile-testing) +- [Debugging](#debugging) +- [Performance Testing](#performance-testing) +- [Parallel Execution](#parallel-execution) +- [Data-Driven Testing](#data-driven-testing) +- [Accessibility Testing](#accessibility-testing) +- [CI/CD Integration](#cicd-integration) +- [Best Practices](#best-practices) +- [Common Patterns & Solutions](#common-patterns--solutions) +- [Troubleshooting](#troubleshooting) + +## Installation & Setup + +### Prerequisites + +Before using this skill, ensure Playwright is available: + +```bash +# Check if Playwright is installed +npm list playwright 2>/dev/null || echo "Playwright not installed" + +# Install (if needed) +cd ~/.claude/skills/playwright-skill +npm run setup +``` + +### Basic Configuration + +Create `playwright.config.ts`: + +```typescript +import { defineConfig, devices } from '@playwright/test'; + +export default defineConfig({ + testDir: './tests', + fullyParallel: true, + forbidOnly: !!process.env.CI, + retries: process.env.CI ? 2 : 0, + workers: process.env.CI ? 1 : undefined, + reporter: 'html', + use: { + baseURL: 'http://localhost:3000', + trace: 'on-first-retry', + screenshot: 'only-on-failure', + video: 'retain-on-failure', + }, + projects: [ + { + name: 'chromium', + use: { ...devices['Desktop Chrome'] }, + }, + ], + webServer: { + command: 'npm run start', + url: 'http://localhost:3000', + reuseExistingServer: !process.env.CI, + }, +}); +``` + +## Core Patterns + +### Basic Browser Automation + +```javascript +const { chromium } = require('playwright'); + +(async () => { + // Launch browser + const browser = await chromium.launch({ + headless: false, // Set to true for headless mode + slowMo: 50 // Slow down operations by 50ms + }); + + const context = await browser.newContext({ + viewport: { width: 1280, height: 720 }, + userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }); + + const page = await context.newPage(); + + // Navigate + await page.goto('https://example.com', { + waitUntil: 'networkidle' // Wait for network to be idle + }); + + // Your automation here + + await browser.close(); +})(); +``` + +### Test Structure + +```typescript +import { test, expect } from '@playwright/test'; + +test.describe('Feature Name', () => { + test.beforeEach(async ({ page }) => { + await page.goto('/'); + }); + + test('should do something', async ({ page }) => { + // Arrange + const button = page.locator('button[data-testid="submit"]'); + + // Act + await button.click(); + + // Assert + await expect(page).toHaveURL('/success'); + await expect(page.locator('.message')).toHaveText('Success!'); + }); +}); +``` + +## Selectors & Locators + +### Best Practices for Selectors + +```javascript +// PREFERRED: Data attributes (most stable) +await page.locator('[data-testid="submit-button"]').click(); +await page.locator('[data-cy="user-input"]').fill('text'); + +// GOOD: Role-based selectors (accessible) +await page.getByRole('button', { name: 'Submit' }).click(); +await page.getByRole('textbox', { name: 'Email' 
}).fill('user@example.com'); +await page.getByRole('heading', { level: 1 }).click(); + +// GOOD: Text content (for unique text) +await page.getByText('Sign in').click(); +await page.getByText(/welcome back/i).click(); + +// OK: Semantic HTML +await page.locator('button[type="submit"]').click(); +await page.locator('input[name="email"]').fill('test@test.com'); + +// AVOID: Classes and IDs (can change frequently) +await page.locator('.btn-primary').click(); // Avoid +await page.locator('#submit').click(); // Avoid + +// LAST RESORT: Complex CSS/XPath +await page.locator('div.container > form > button').click(); // Fragile +``` + +### Advanced Locator Patterns + +```javascript +// Filter and chain locators +const row = page.locator('tr').filter({ hasText: 'John Doe' }); +await row.locator('button').click(); + +// Nth element +await page.locator('button').nth(2).click(); + +// Combining conditions +await page.locator('button').and(page.locator('[disabled]')).count(); + +// Parent/child navigation +const cell = page.locator('td').filter({ hasText: 'Active' }); +const row = cell.locator('..'); +await row.locator('button.edit').click(); +``` + +## Common Actions + +### Form Interactions + +```javascript +// Text input +await page.getByLabel('Email').fill('user@example.com'); +await page.getByPlaceholder('Enter your name').fill('John Doe'); + +// Clear and type +await page.locator('#username').clear(); +await page.locator('#username').type('newuser', { delay: 100 }); + +// Checkbox +await page.getByLabel('I agree').check(); +await page.getByLabel('Subscribe').uncheck(); + +// Radio button +await page.getByLabel('Option 2').check(); + +// Select dropdown +await page.selectOption('select#country', 'usa'); +await page.selectOption('select#country', { label: 'United States' }); +await page.selectOption('select#country', { index: 2 }); + +// Multi-select +await page.selectOption('select#colors', ['red', 'blue', 'green']); + +// File upload +await page.setInputFiles('input[type="file"]', 'path/to/file.pdf'); +await page.setInputFiles('input[type="file"]', [ + 'file1.pdf', + 'file2.pdf' +]); +``` + +### Mouse Actions + +```javascript +// Click variations +await page.click('button'); // Left click +await page.click('button', { button: 'right' }); // Right click +await page.dblclick('button'); // Double click +await page.click('button', { position: { x: 10, y: 10 } }); // Click at position + +// Hover +await page.hover('.menu-item'); + +// Drag and drop +await page.dragAndDrop('#source', '#target'); + +// Manual drag +await page.locator('#source').hover(); +await page.mouse.down(); +await page.locator('#target').hover(); +await page.mouse.up(); +``` + +### Keyboard Actions + +```javascript +// Type with delay +await page.keyboard.type('Hello World', { delay: 100 }); + +// Key combinations +await page.keyboard.press('Control+A'); +await page.keyboard.press('Control+C'); +await page.keyboard.press('Control+V'); + +// Special keys +await page.keyboard.press('Enter'); +await page.keyboard.press('Tab'); +await page.keyboard.press('Escape'); +await page.keyboard.press('ArrowDown'); +``` + +## Waiting Strategies + +### Smart Waiting + +```javascript +// Wait for element states +await page.locator('button').waitFor({ state: 'visible' }); +await page.locator('.spinner').waitFor({ state: 'hidden' }); +await page.locator('button').waitFor({ state: 'attached' }); +await page.locator('button').waitFor({ state: 'detached' }); + +// Wait for specific conditions +await page.waitForURL('**/success'); +await 
page.waitForURL(url => url.pathname === '/dashboard'); + +// Wait for network +await page.waitForLoadState('networkidle'); +await page.waitForLoadState('domcontentloaded'); + +// Wait for function +await page.waitForFunction(() => document.querySelector('.loaded')); +await page.waitForFunction( + text => document.body.innerText.includes(text), + 'Content loaded' +); + +// Wait for response +const responsePromise = page.waitForResponse('**/api/users'); +await page.click('button#load-users'); +const response = await responsePromise; + +// Wait for request +await page.waitForRequest(request => + request.url().includes('/api/') && request.method() === 'POST' +); + +// Custom timeout +await page.locator('.slow-element').waitFor({ + state: 'visible', + timeout: 10000 // 10 seconds +}); +``` + +## Assertions + +### Common Assertions + +```javascript +import { expect } from '@playwright/test'; + +// Page assertions +await expect(page).toHaveTitle('My App'); +await expect(page).toHaveURL('https://example.com/dashboard'); +await expect(page).toHaveURL(/.*dashboard/); + +// Element visibility +await expect(page.locator('.message')).toBeVisible(); +await expect(page.locator('.spinner')).toBeHidden(); +await expect(page.locator('button')).toBeEnabled(); +await expect(page.locator('input')).toBeDisabled(); + +// Text content +await expect(page.locator('h1')).toHaveText('Welcome'); +await expect(page.locator('.message')).toContainText('success'); +await expect(page.locator('.items')).toHaveText(['Item 1', 'Item 2']); + +// Input values +await expect(page.locator('input')).toHaveValue('test@example.com'); +await expect(page.locator('input')).toBeEmpty(); + +// Attributes +await expect(page.locator('button')).toHaveAttribute('type', 'submit'); +await expect(page.locator('img')).toHaveAttribute('src', /.*\.png/); + +// CSS properties +await expect(page.locator('.error')).toHaveCSS('color', 'rgb(255, 0, 0)'); + +// Count +await expect(page.locator('.item')).toHaveCount(5); + +// Checkbox/Radio state +await expect(page.locator('input[type="checkbox"]')).toBeChecked(); +``` + +## Page Object Model (POM) + +### Basic Page Object + +```javascript +// pages/LoginPage.js +class LoginPage { + constructor(page) { + this.page = page; + this.usernameInput = page.locator('input[name="username"]'); + this.passwordInput = page.locator('input[name="password"]'); + this.submitButton = page.locator('button[type="submit"]'); + this.errorMessage = page.locator('.error-message'); + } + + async navigate() { + await this.page.goto('/login'); + } + + async login(username, password) { + await this.usernameInput.fill(username); + await this.passwordInput.fill(password); + await this.submitButton.click(); + } + + async getErrorMessage() { + return await this.errorMessage.textContent(); + } +} + +// Usage in test +test('login with valid credentials', async ({ page }) => { + const loginPage = new LoginPage(page); + await loginPage.navigate(); + await loginPage.login('user@example.com', 'password123'); + await expect(page).toHaveURL('/dashboard'); +}); +``` + +## Network & API Testing + +### Intercepting Requests + +```javascript +// Mock API responses +await page.route('**/api/users', route => { + route.fulfill({ + status: 200, + contentType: 'application/json', + body: JSON.stringify([ + { id: 1, name: 'John' }, + { id: 2, name: 'Jane' } + ]) + }); +}); + +// Modify requests +await page.route('**/api/**', route => { + const headers = { + ...route.request().headers(), + 'X-Custom-Header': 'value' + }; + route.continue({ headers }); 
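+  // continue() forwards the request onward with the merged header set (vs. fulfill(), which responds directly)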
+}); + +// Block resources +await page.route('**/*.{png,jpg,jpeg,gif}', route => route.abort()); +``` + +### Custom Headers via Environment Variables + +The skill supports automatic header injection via environment variables: + +```bash +# Single header (simple) +PW_HEADER_NAME=X-Automated-By PW_HEADER_VALUE=playwright-skill + +# Multiple headers (JSON) +PW_EXTRA_HEADERS='{"X-Automated-By":"playwright-skill","X-Request-ID":"123"}' +``` + +These headers are automatically applied to all requests when using: +- `helpers.createContext(browser)` - headers merged automatically +- `getContextOptionsWithHeaders(options)` - utility injected by run.js wrapper + +**Precedence (highest to lowest):** +1. Headers passed directly in `options.extraHTTPHeaders` +2. Environment variable headers +3. Playwright defaults + +**Use case:** Identify automated traffic so your backend can return LLM-optimized responses (e.g., plain text errors instead of styled HTML). + +## Visual Testing + +### Screenshots + +```javascript +// Full page screenshot +await page.screenshot({ + path: 'screenshot.png', + fullPage: true +}); + +// Element screenshot +await page.locator('.chart').screenshot({ + path: 'chart.png' +}); + +// Visual comparison +await expect(page).toHaveScreenshot('homepage.png'); +``` + +## Mobile Testing + +```javascript +// Device emulation +const { devices } = require('playwright'); +const iPhone = devices['iPhone 12']; + +const context = await browser.newContext({ + ...iPhone, + locale: 'en-US', + permissions: ['geolocation'], + geolocation: { latitude: 37.7749, longitude: -122.4194 } +}); +``` + +## Debugging + +### Debug Mode + +```bash +# Run with inspector +npx playwright test --debug + +# Headed mode +npx playwright test --headed + +# Slow motion +npx playwright test --headed --slowmo=1000 +``` + +### In-Code Debugging + +```javascript +// Pause execution +await page.pause(); + +// Console logs +page.on('console', msg => console.log('Browser log:', msg.text())); +page.on('pageerror', error => console.log('Page error:', error)); +``` + +## Performance Testing + +```javascript +// Measure page load time +const startTime = Date.now(); +await page.goto('https://example.com'); +const loadTime = Date.now() - startTime; +console.log(`Page loaded in ${loadTime}ms`); +``` + +## Parallel Execution + +```javascript +// Run tests in parallel +test.describe.parallel('Parallel suite', () => { + test('test 1', async ({ page }) => { + // Runs in parallel with test 2 + }); + + test('test 2', async ({ page }) => { + // Runs in parallel with test 1 + }); +}); +``` + +## Data-Driven Testing + +```javascript +// Parameterized tests +const testData = [ + { username: 'user1', password: 'pass1', expected: 'Welcome user1' }, + { username: 'user2', password: 'pass2', expected: 'Welcome user2' }, +]; + +testData.forEach(({ username, password, expected }) => { + test(`login with ${username}`, async ({ page }) => { + await page.goto('/login'); + await page.fill('#username', username); + await page.fill('#password', password); + await page.click('button[type="submit"]'); + await expect(page.locator('.message')).toHaveText(expected); + }); +}); +``` + +## Accessibility Testing + +```javascript +import { injectAxe, checkA11y } from 'axe-playwright'; + +test('accessibility check', async ({ page }) => { + await page.goto('/'); + await injectAxe(page); + await checkA11y(page); +}); +``` + +## CI/CD Integration + +### GitHub Actions + +```yaml +name: Playwright Tests +on: + push: + branches: [main, master] +jobs: + test: + runs-on: 
ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-node@v3 + - name: Install dependencies + run: npm ci + - name: Install Playwright Browsers + run: npx playwright install --with-deps + - name: Run tests + run: npx playwright test +``` + +## Best Practices + +1. **Test Organization** - Use descriptive test names, group related tests +2. **Selector Strategy** - Prefer data-testid attributes, use role-based selectors +3. **Waiting** - Use Playwright's auto-waiting, avoid hard-coded delays +4. **Error Handling** - Add proper error messages, take screenshots on failure +5. **Performance** - Run tests in parallel, reuse authentication state + +## Common Patterns & Solutions + +### Handling Popups + +```javascript +const [popup] = await Promise.all([ + page.waitForEvent('popup'), + page.click('button.open-popup') +]); +await popup.waitForLoadState(); +``` + +### File Downloads + +```javascript +const [download] = await Promise.all([ + page.waitForEvent('download'), + page.click('button.download') +]); +await download.saveAs(`./downloads/${download.suggestedFilename()}`); +``` + +### iFrames + +```javascript +const frame = page.frameLocator('#my-iframe'); +await frame.locator('button').click(); +``` + +### Infinite Scroll + +```javascript +async function scrollToBottom(page) { + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await page.waitForTimeout(500); +} +``` + +## Troubleshooting + +### Common Issues + +1. **Element not found** - Check if element is in iframe, verify visibility +2. **Timeout errors** - Increase timeout, check network conditions +3. **Flaky tests** - Use proper waiting strategies, mock external dependencies +4. **Authentication issues** - Verify auth state is properly saved + +## Quick Reference Commands + +```bash +# Run tests +npx playwright test + +# Run in headed mode +npx playwright test --headed + +# Debug tests +npx playwright test --debug + +# Generate code +npx playwright codegen https://example.com + +# Show report +npx playwright show-report +``` + +## Additional Resources + +- [Playwright Documentation](https://playwright.dev/docs/intro) +- [API Reference](https://playwright.dev/docs/api/class-playwright) +- [Best Practices](https://playwright.dev/docs/best-practices) diff --git a/components/skills/frontend-testing-playwright-dev/SKILL.md b/components/skills/frontend-testing-playwright-dev/SKILL.md new file mode 100644 index 0000000..98c8214 --- /dev/null +++ b/components/skills/frontend-testing-playwright-dev/SKILL.md @@ -0,0 +1,453 @@ +--- +name: playwright-skill +description: Complete browser automation with Playwright. Auto-detects dev servers, writes clean test scripts to /tmp. Test pages, fill forms, take screenshots, check responsive design, validate UX, test login flows, check links, automate any browser task. Use when user wants to test websites, automate browser interactions, validate web functionality, or perform any browser-based testing. +--- + +**IMPORTANT - Path Resolution:** +This skill can be installed in different locations (plugin system, manual installation, global, or project-specific). Before executing any commands, determine the skill directory based on where you loaded this SKILL.md file, and use that path in all commands below. Replace `$SKILL_DIR` with the actual discovered path. 
+ +Common installation paths: + +- Plugin system: `~/.claude/plugins/marketplaces/playwright-skill/skills/playwright-skill` +- Manual global: `~/.claude/skills/playwright-skill` +- Project-specific: `/.claude/skills/playwright-skill` + +# Playwright Browser Automation + +General-purpose browser automation skill. I'll write custom Playwright code for any automation task you request and execute it via the universal executor. + +**CRITICAL WORKFLOW - Follow these steps in order:** + +1. **Auto-detect dev servers** - For localhost testing, ALWAYS run server detection FIRST: + + ```bash + cd $SKILL_DIR && node -e "require('./lib/helpers').detectDevServers().then(servers => console.log(JSON.stringify(servers)))" + ``` + + - If **1 server found**: Use it automatically, inform user + - If **multiple servers found**: Ask user which one to test + - If **no servers found**: Ask for URL or offer to help start dev server + +2. **Write scripts to /tmp** - NEVER write test files to skill directory; always use `/tmp/playwright-test-*.js` + +3. **Use visible browser by default** - Always use `headless: false` unless user specifically requests headless mode + +4. **Parameterize URLs** - Always make URLs configurable via environment variable or constant at top of script + +## How It Works + +1. You describe what you want to test/automate +2. I auto-detect running dev servers (or ask for URL if testing external site) +3. I write custom Playwright code in `/tmp/playwright-test-*.js` (won't clutter your project) +4. I execute it via: `cd $SKILL_DIR && node run.js /tmp/playwright-test-*.js` +5. Results displayed in real-time, browser window visible for debugging +6. Test files auto-cleaned from /tmp by your OS + +## Setup (First Time) + +```bash +cd $SKILL_DIR +npm run setup +``` + +This installs Playwright and Chromium browser. Only needed once. 
+ +## Execution Pattern + +**Step 1: Detect dev servers (for localhost testing)** + +```bash +cd $SKILL_DIR && node -e "require('./lib/helpers').detectDevServers().then(s => console.log(JSON.stringify(s)))" +``` + +**Step 2: Write test script to /tmp with URL parameter** + +```javascript +// /tmp/playwright-test-page.js +const { chromium } = require('playwright'); + +// Parameterized URL (detected or user-provided) +const TARGET_URL = 'http://localhost:3001'; // <-- Auto-detected or from user + +(async () => { + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage(); + + await page.goto(TARGET_URL); + console.log('Page loaded:', await page.title()); + + await page.screenshot({ path: '/tmp/screenshot.png', fullPage: true }); + console.log('πŸ“Έ Screenshot saved to /tmp/screenshot.png'); + + await browser.close(); +})(); +``` + +**Step 3: Execute from skill directory** + +```bash +cd $SKILL_DIR && node run.js /tmp/playwright-test-page.js +``` + +## Common Patterns + +### Test a Page (Multiple Viewports) + +```javascript +// /tmp/playwright-test-responsive.js +const { chromium } = require('playwright'); + +const TARGET_URL = 'http://localhost:3001'; // Auto-detected + +(async () => { + const browser = await chromium.launch({ headless: false, slowMo: 100 }); + const page = await browser.newPage(); + + // Desktop test + await page.setViewportSize({ width: 1920, height: 1080 }); + await page.goto(TARGET_URL); + console.log('Desktop - Title:', await page.title()); + await page.screenshot({ path: '/tmp/desktop.png', fullPage: true }); + + // Mobile test + await page.setViewportSize({ width: 375, height: 667 }); + await page.screenshot({ path: '/tmp/mobile.png', fullPage: true }); + + await browser.close(); +})(); +``` + +### Test Login Flow + +```javascript +// /tmp/playwright-test-login.js +const { chromium } = require('playwright'); + +const TARGET_URL = 'http://localhost:3001'; // Auto-detected + +(async () => { + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage(); + + await page.goto(`${TARGET_URL}/login`); + + await page.fill('input[name="email"]', 'test@example.com'); + await page.fill('input[name="password"]', 'password123'); + await page.click('button[type="submit"]'); + + // Wait for redirect + await page.waitForURL('**/dashboard'); + console.log('βœ… Login successful, redirected to dashboard'); + + await browser.close(); +})(); +``` + +### Fill and Submit Form + +```javascript +// /tmp/playwright-test-form.js +const { chromium } = require('playwright'); + +const TARGET_URL = 'http://localhost:3001'; // Auto-detected + +(async () => { + const browser = await chromium.launch({ headless: false, slowMo: 50 }); + const page = await browser.newPage(); + + await page.goto(`${TARGET_URL}/contact`); + + await page.fill('input[name="name"]', 'John Doe'); + await page.fill('input[name="email"]', 'john@example.com'); + await page.fill('textarea[name="message"]', 'Test message'); + await page.click('button[type="submit"]'); + + // Verify submission + await page.waitForSelector('.success-message'); + console.log('βœ… Form submitted successfully'); + + await browser.close(); +})(); +``` + +### Check for Broken Links + +```javascript +const { chromium } = require('playwright'); + +(async () => { + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage(); + + await page.goto('http://localhost:3000'); + + const links = await page.locator('a[href^="http"]').all(); + const 
results = { working: 0, broken: [] }; + + for (const link of links) { + const href = await link.getAttribute('href'); + try { + const response = await page.request.head(href); + if (response.ok()) { + results.working++; + } else { + results.broken.push({ url: href, status: response.status() }); + } + } catch (e) { + results.broken.push({ url: href, error: e.message }); + } + } + + console.log(`βœ… Working links: ${results.working}`); + console.log(`❌ Broken links:`, results.broken); + + await browser.close(); +})(); +``` + +### Take Screenshot with Error Handling + +```javascript +const { chromium } = require('playwright'); + +(async () => { + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage(); + + try { + await page.goto('http://localhost:3000', { + waitUntil: 'networkidle', + timeout: 10000, + }); + + await page.screenshot({ + path: '/tmp/screenshot.png', + fullPage: true, + }); + + console.log('πŸ“Έ Screenshot saved to /tmp/screenshot.png'); + } catch (error) { + console.error('❌ Error:', error.message); + } finally { + await browser.close(); + } +})(); +``` + +### Test Responsive Design + +```javascript +// /tmp/playwright-test-responsive-full.js +const { chromium } = require('playwright'); + +const TARGET_URL = 'http://localhost:3001'; // Auto-detected + +(async () => { + const browser = await chromium.launch({ headless: false }); + const page = await browser.newPage(); + + const viewports = [ + { name: 'Desktop', width: 1920, height: 1080 }, + { name: 'Tablet', width: 768, height: 1024 }, + { name: 'Mobile', width: 375, height: 667 }, + ]; + + for (const viewport of viewports) { + console.log( + `Testing ${viewport.name} (${viewport.width}x${viewport.height})`, + ); + + await page.setViewportSize({ + width: viewport.width, + height: viewport.height, + }); + + await page.goto(TARGET_URL); + await page.waitForTimeout(1000); + + await page.screenshot({ + path: `/tmp/${viewport.name.toLowerCase()}.png`, + fullPage: true, + }); + } + + console.log('βœ… All viewports tested'); + await browser.close(); +})(); +``` + +## Inline Execution (Simple Tasks) + +For quick one-off tasks, you can execute code inline without creating files: + +```bash +# Take a quick screenshot +cd $SKILL_DIR && node run.js " +const browser = await chromium.launch({ headless: false }); +const page = await browser.newPage(); +await page.goto('http://localhost:3001'); +await page.screenshot({ path: '/tmp/quick-screenshot.png', fullPage: true }); +console.log('Screenshot saved'); +await browser.close(); +" +``` + +**When to use inline vs files:** + +- **Inline**: Quick one-off tasks (screenshot, check if element exists, get page title) +- **Files**: Complex tests, responsive design checks, anything user might want to re-run + +## Available Helpers + +Optional utility functions in `lib/helpers.js`: + +```javascript +const helpers = require('./lib/helpers'); + +// Detect running dev servers (CRITICAL - use this first!) +const servers = await helpers.detectDevServers(); +console.log('Found servers:', servers); + +// Safe click with retry +await helpers.safeClick(page, 'button.submit', { retries: 3 }); + +// Safe type with clear +await helpers.safeType(page, '#username', 'testuser'); + +// Take timestamped screenshot +await helpers.takeScreenshot(page, 'test-result'); + +// Handle cookie banners +await helpers.handleCookieBanner(page); + +// Extract table data +const data = await helpers.extractTableData(page, 'table.results'); +``` + +See `lib/helpers.js` for full list. 
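+
+As a worked example, here is a minimal sketch of a `/tmp` script that combines several of these helpers; the target URL, selectors, and screenshot name are placeholders to adapt after server detection:
+
+```javascript
+// /tmp/playwright-test-helpers-demo.js -- sketch only; URL and selectors are placeholders
+const { chromium } = require('playwright');
+const helpers = require('./lib/helpers');
+
+const TARGET_URL = 'http://localhost:3001'; // use the server reported by detectDevServers()
+
+(async () => {
+  const browser = await chromium.launch({ headless: false });
+  const context = await helpers.createContext(browser); // merges PW_HEADER_* / PW_EXTRA_HEADERS
+  const page = await context.newPage();
+
+  await page.goto(TARGET_URL);
+  await helpers.handleCookieBanner(page);                         // dismiss consent banner if present
+  await helpers.safeType(page, 'input[name="email"]', 'test@example.com'); // clears the field, then fills
+  await helpers.safeClick(page, 'button[type="submit"]', { retries: 3 });
+  await helpers.takeScreenshot(page, 'helpers-demo');             // timestamped PNG in the working directory
+
+  await browser.close();
+})();
+```
+
+Run it the same way as any other test script: `cd $SKILL_DIR && node run.js /tmp/playwright-test-helpers-demo.js`.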
+ +## Custom HTTP Headers + +Configure custom headers for all HTTP requests via environment variables. Useful for: + +- Identifying automated traffic to your backend +- Getting LLM-optimized responses (e.g., plain text errors instead of styled HTML) +- Adding authentication tokens globally + +### Configuration + +**Single header (common case):** + +```bash +PW_HEADER_NAME=X-Automated-By PW_HEADER_VALUE=playwright-skill \ + cd $SKILL_DIR && node run.js /tmp/my-script.js +``` + +**Multiple headers (JSON format):** + +```bash +PW_EXTRA_HEADERS='{"X-Automated-By":"playwright-skill","X-Debug":"true"}' \ + cd $SKILL_DIR && node run.js /tmp/my-script.js +``` + +### How It Works + +Headers are automatically applied when using `helpers.createContext()`: + +```javascript +const context = await helpers.createContext(browser); +const page = await context.newPage(); +// All requests from this page include your custom headers +``` + +For scripts using raw Playwright API, use the injected `getContextOptionsWithHeaders()`: + +```javascript +const context = await browser.newContext( + getContextOptionsWithHeaders({ viewport: { width: 1920, height: 1080 } }), +); +``` + +## Advanced Usage + +For comprehensive Playwright API documentation, see [API_REFERENCE.md](API_REFERENCE.md): + +- Selectors & Locators best practices +- Network interception & API mocking +- Authentication & session management +- Visual regression testing +- Mobile device emulation +- Performance testing +- Debugging techniques +- CI/CD integration + +## Tips + +- **CRITICAL: Detect servers FIRST** - Always run `detectDevServers()` before writing test code for localhost testing +- **Custom headers** - Use `PW_HEADER_NAME`/`PW_HEADER_VALUE` env vars to identify automated traffic to your backend +- **Use /tmp for test files** - Write to `/tmp/playwright-test-*.js`, never to skill directory or user's project +- **Parameterize URLs** - Put detected/provided URL in a `TARGET_URL` constant at the top of every script +- **DEFAULT: Visible browser** - Always use `headless: false` unless user explicitly asks for headless mode +- **Headless mode** - Only use `headless: true` when user specifically requests "headless" or "background" execution +- **Slow down:** Use `slowMo: 100` to make actions visible and easier to follow +- **Wait strategies:** Use `waitForURL`, `waitForSelector`, `waitForLoadState` instead of fixed timeouts +- **Error handling:** Always use try-catch for robust automation +- **Console output:** Use `console.log()` to track progress and show what's happening + +## Troubleshooting + +**Playwright not installed:** + +```bash +cd $SKILL_DIR && npm run setup +``` + +**Module not found:** +Ensure running from skill directory via `run.js` wrapper + +**Browser doesn't open:** +Check `headless: false` and ensure display available + +**Element not found:** +Add wait: `await page.waitForSelector('.element', { timeout: 10000 })` + +## Example Usage + +``` +User: "Test if the marketing page looks good" + +Claude: I'll test the marketing page across multiple viewports. Let me first detect running servers... +[Runs: detectDevServers()] +[Output: Found server on port 3001] +I found your dev server running on http://localhost:3001 + +[Writes custom automation script to /tmp/playwright-test-marketing.js with URL parameterized] +[Runs: cd $SKILL_DIR && node run.js /tmp/playwright-test-marketing.js] +[Shows results with screenshots from /tmp/] +``` + +``` +User: "Check if login redirects correctly" + +Claude: I'll test the login flow. 
First, let me check for running servers... +[Runs: detectDevServers()] +[Output: Found servers on ports 3000 and 3001] +I found 2 dev servers. Which one should I test? +- http://localhost:3000 +- http://localhost:3001 + +User: "Use 3001" + +[Writes login automation to /tmp/playwright-test-login.js] +[Runs: cd $SKILL_DIR && node run.js /tmp/playwright-test-login.js] +[Reports: βœ… Login successful, redirected to /dashboard] +``` + +## Notes + +- Each automation is custom-written for your specific request +- Not limited to pre-built scripts - any browser task possible +- Auto-detects running dev servers to eliminate hardcoded URLs +- Test scripts written to `/tmp` for automatic cleanup (no clutter) +- Code executes reliably with proper module resolution via `run.js` +- Progressive disclosure - API_REFERENCE.md loaded only when advanced features needed diff --git a/components/skills/frontend-testing-playwright-dev/lib/helpers.js b/components/skills/frontend-testing-playwright-dev/lib/helpers.js new file mode 100644 index 0000000..0920d68 --- /dev/null +++ b/components/skills/frontend-testing-playwright-dev/lib/helpers.js @@ -0,0 +1,441 @@ +// playwright-helpers.js +// Reusable utility functions for Playwright automation + +const { chromium, firefox, webkit } = require('playwright'); + +/** + * Parse extra HTTP headers from environment variables. + * Supports two formats: + * - PW_HEADER_NAME + PW_HEADER_VALUE: Single header (simple, common case) + * - PW_EXTRA_HEADERS: JSON object for multiple headers (advanced) + * Single header format takes precedence if both are set. + * @returns {Object|null} Headers object or null if none configured + */ +function getExtraHeadersFromEnv() { + const headerName = process.env.PW_HEADER_NAME; + const headerValue = process.env.PW_HEADER_VALUE; + + if (headerName && headerValue) { + return { [headerName]: headerValue }; + } + + const headersJson = process.env.PW_EXTRA_HEADERS; + if (headersJson) { + try { + const parsed = JSON.parse(headersJson); + if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) { + return parsed; + } + console.warn('PW_EXTRA_HEADERS must be a JSON object, ignoring...'); + } catch (e) { + console.warn('Failed to parse PW_EXTRA_HEADERS as JSON:', e.message); + } + } + + return null; +} + +/** + * Launch browser with standard configuration + * @param {string} browserType - 'chromium', 'firefox', or 'webkit' + * @param {Object} options - Additional launch options + */ +async function launchBrowser(browserType = 'chromium', options = {}) { + const defaultOptions = { + headless: process.env.HEADLESS !== 'false', + slowMo: process.env.SLOW_MO ? 
parseInt(process.env.SLOW_MO) : 0, + args: ['--no-sandbox', '--disable-setuid-sandbox'] + }; + + const browsers = { chromium, firefox, webkit }; + const browser = browsers[browserType]; + + if (!browser) { + throw new Error(`Invalid browser type: ${browserType}`); + } + + return await browser.launch({ ...defaultOptions, ...options }); +} + +/** + * Create a new page with viewport and user agent + * @param {Object} context - Browser context + * @param {Object} options - Page options + */ +async function createPage(context, options = {}) { + const page = await context.newPage(); + + if (options.viewport) { + await page.setViewportSize(options.viewport); + } + + if (options.userAgent) { + await page.setExtraHTTPHeaders({ + 'User-Agent': options.userAgent + }); + } + + // Set default timeout + page.setDefaultTimeout(options.timeout || 30000); + + return page; +} + +/** + * Smart wait for page to be ready + * @param {Object} page - Playwright page + * @param {Object} options - Wait options + */ +async function waitForPageReady(page, options = {}) { + const waitOptions = { + waitUntil: options.waitUntil || 'networkidle', + timeout: options.timeout || 30000 + }; + + try { + await page.waitForLoadState(waitOptions.waitUntil, { + timeout: waitOptions.timeout + }); + } catch (e) { + console.warn('Page load timeout, continuing...'); + } + + // Additional wait for dynamic content if selector provided + if (options.waitForSelector) { + await page.waitForSelector(options.waitForSelector, { + timeout: options.timeout + }); + } +} + +/** + * Safe click with retry logic + * @param {Object} page - Playwright page + * @param {string} selector - Element selector + * @param {Object} options - Click options + */ +async function safeClick(page, selector, options = {}) { + const maxRetries = options.retries || 3; + const retryDelay = options.retryDelay || 1000; + + for (let i = 0; i < maxRetries; i++) { + try { + await page.waitForSelector(selector, { + state: 'visible', + timeout: options.timeout || 5000 + }); + await page.click(selector, { + force: options.force || false, + timeout: options.timeout || 5000 + }); + return true; + } catch (e) { + if (i === maxRetries - 1) { + console.error(`Failed to click ${selector} after ${maxRetries} attempts`); + throw e; + } + console.log(`Retry ${i + 1}/${maxRetries} for clicking ${selector}`); + await page.waitForTimeout(retryDelay); + } + } +} + +/** + * Safe text input with clear before type + * @param {Object} page - Playwright page + * @param {string} selector - Input selector + * @param {string} text - Text to type + * @param {Object} options - Type options + */ +async function safeType(page, selector, text, options = {}) { + await page.waitForSelector(selector, { + state: 'visible', + timeout: options.timeout || 10000 + }); + + if (options.clear !== false) { + await page.fill(selector, ''); + } + + if (options.slow) { + await page.type(selector, text, { delay: options.delay || 100 }); + } else { + await page.fill(selector, text); + } +} + +/** + * Extract text from multiple elements + * @param {Object} page - Playwright page + * @param {string} selector - Elements selector + */ +async function extractTexts(page, selector) { + await page.waitForSelector(selector, { timeout: 10000 }); + return await page.$$eval(selector, elements => + elements.map(el => el.textContent?.trim()).filter(Boolean) + ); +} + +/** + * Take screenshot with timestamp + * @param {Object} page - Playwright page + * @param {string} name - Screenshot name + * @param {Object} options - Screenshot 
options + */ +async function takeScreenshot(page, name, options = {}) { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const filename = `${name}-${timestamp}.png`; + + await page.screenshot({ + path: filename, + fullPage: options.fullPage !== false, + ...options + }); + + console.log(`Screenshot saved: ${filename}`); + return filename; +} + +/** + * Handle authentication + * @param {Object} page - Playwright page + * @param {Object} credentials - Username and password + * @param {Object} selectors - Login form selectors + */ +async function authenticate(page, credentials, selectors = {}) { + const defaultSelectors = { + username: 'input[name="username"], input[name="email"], #username, #email', + password: 'input[name="password"], #password', + submit: 'button[type="submit"], input[type="submit"], button:has-text("Login"), button:has-text("Sign in")' + }; + + const finalSelectors = { ...defaultSelectors, ...selectors }; + + await safeType(page, finalSelectors.username, credentials.username); + await safeType(page, finalSelectors.password, credentials.password); + await safeClick(page, finalSelectors.submit); + + // Wait for navigation or success indicator + await Promise.race([ + page.waitForNavigation({ waitUntil: 'networkidle' }), + page.waitForSelector(selectors.successIndicator || '.dashboard, .user-menu, .logout', { timeout: 10000 }) + ]).catch(() => { + console.log('Login might have completed without navigation'); + }); +} + +/** + * Scroll page + * @param {Object} page - Playwright page + * @param {string} direction - 'down', 'up', 'top', 'bottom' + * @param {number} distance - Pixels to scroll (for up/down) + */ +async function scrollPage(page, direction = 'down', distance = 500) { + switch (direction) { + case 'down': + await page.evaluate(d => window.scrollBy(0, d), distance); + break; + case 'up': + await page.evaluate(d => window.scrollBy(0, -d), distance); + break; + case 'top': + await page.evaluate(() => window.scrollTo(0, 0)); + break; + case 'bottom': + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + break; + } + await page.waitForTimeout(500); // Wait for scroll animation +} + +/** + * Extract table data + * @param {Object} page - Playwright page + * @param {string} tableSelector - Table selector + */ +async function extractTableData(page, tableSelector) { + await page.waitForSelector(tableSelector); + + return await page.evaluate((selector) => { + const table = document.querySelector(selector); + if (!table) return null; + + const headers = Array.from(table.querySelectorAll('thead th')).map(th => + th.textContent?.trim() + ); + + const rows = Array.from(table.querySelectorAll('tbody tr')).map(tr => { + const cells = Array.from(tr.querySelectorAll('td')); + if (headers.length > 0) { + return cells.reduce((obj, cell, index) => { + obj[headers[index] || `column_${index}`] = cell.textContent?.trim(); + return obj; + }, {}); + } else { + return cells.map(cell => cell.textContent?.trim()); + } + }); + + return { headers, rows }; + }, tableSelector); +} + +/** + * Wait for and dismiss cookie banners + * @param {Object} page - Playwright page + * @param {number} timeout - Max time to wait + */ +async function handleCookieBanner(page, timeout = 3000) { + const commonSelectors = [ + 'button:has-text("Accept")', + 'button:has-text("Accept all")', + 'button:has-text("OK")', + 'button:has-text("Got it")', + 'button:has-text("I agree")', + '.cookie-accept', + '#cookie-accept', + '[data-testid="cookie-accept"]' + ]; + + for (const 
selector of commonSelectors) { + try { + const element = await page.waitForSelector(selector, { + timeout: timeout / commonSelectors.length, + state: 'visible' + }); + if (element) { + await element.click(); + console.log('Cookie banner dismissed'); + return true; + } + } catch (e) { + // Continue to next selector + } + } + + return false; +} + +/** + * Retry a function with exponential backoff + * @param {Function} fn - Function to retry + * @param {number} maxRetries - Maximum retry attempts + * @param {number} initialDelay - Initial delay in ms + */ +async function retryWithBackoff(fn, maxRetries = 3, initialDelay = 1000) { + let lastError; + + for (let i = 0; i < maxRetries; i++) { + try { + return await fn(); + } catch (error) { + lastError = error; + const delay = initialDelay * Math.pow(2, i); + console.log(`Attempt ${i + 1} failed, retrying in ${delay}ms...`); + await new Promise(resolve => setTimeout(resolve, delay)); + } + } + + throw lastError; +} + +/** + * Create browser context with common settings + * @param {Object} browser - Browser instance + * @param {Object} options - Context options + */ +async function createContext(browser, options = {}) { + const envHeaders = getExtraHeadersFromEnv(); + + // Merge environment headers with any passed in options + const mergedHeaders = { + ...envHeaders, + ...options.extraHTTPHeaders + }; + + const defaultOptions = { + viewport: { width: 1280, height: 720 }, + userAgent: options.mobile + ? 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1' + : undefined, + permissions: options.permissions || [], + geolocation: options.geolocation, + locale: options.locale || 'en-US', + timezoneId: options.timezoneId || 'America/New_York', + // Only include extraHTTPHeaders if we have any + ...(Object.keys(mergedHeaders).length > 0 && { extraHTTPHeaders: mergedHeaders }) + }; + + return await browser.newContext({ ...defaultOptions, ...options }); +} + +/** + * Detect running dev servers on common ports + * @param {Array} customPorts - Additional ports to check + * @returns {Promise} Array of detected server URLs + */ +async function detectDevServers(customPorts = []) { + const http = require('http'); + + // Common dev server ports + const commonPorts = [3000, 3001, 3002, 5173, 8080, 8000, 4200, 5000, 9000, 1234]; + const allPorts = [...new Set([...commonPorts, ...customPorts])]; + + const detectedServers = []; + + console.log('πŸ” Checking for running dev servers...'); + + for (const port of allPorts) { + try { + await new Promise((resolve, reject) => { + const req = http.request({ + hostname: 'localhost', + port: port, + path: '/', + method: 'HEAD', + timeout: 500 + }, (res) => { + if (res.statusCode < 500) { + detectedServers.push(`http://localhost:${port}`); + console.log(` βœ… Found server on port ${port}`); + } + resolve(); + }); + + req.on('error', () => resolve()); + req.on('timeout', () => { + req.destroy(); + resolve(); + }); + + req.end(); + }); + } catch (e) { + // Port not available, continue + } + } + + if (detectedServers.length === 0) { + console.log(' ❌ No dev servers detected'); + } + + return detectedServers; +} + +module.exports = { + launchBrowser, + createPage, + waitForPageReady, + safeClick, + safeType, + extractTexts, + takeScreenshot, + authenticate, + scrollPage, + extractTableData, + handleCookieBanner, + retryWithBackoff, + createContext, + detectDevServers, + getExtraHeadersFromEnv +}; diff --git 
a/components/skills/frontend-testing-playwright-dev/package.json b/components/skills/frontend-testing-playwright-dev/package.json new file mode 100644 index 0000000..ada6c8b --- /dev/null +++ b/components/skills/frontend-testing-playwright-dev/package.json @@ -0,0 +1,26 @@ +{ + "name": "playwright-skill", + "version": "4.1.0", + "description": "General-purpose browser automation with Playwright for Claude Code with auto-detection and smart test management", + "author": "lackeyjb", + "main": "run.js", + "scripts": { + "setup": "npm install && npx playwright install chromium", + "install-all-browsers": "npx playwright install chromium firefox webkit" + }, + "keywords": [ + "playwright", + "automation", + "browser-testing", + "web-automation", + "claude-skill", + "general-purpose" + ], + "dependencies": { + "playwright": "^1.57.0" + }, + "engines": { + "node": ">=14.0.0" + }, + "license": "MIT" +} diff --git a/components/skills/frontend-testing-playwright-dev/run.js b/components/skills/frontend-testing-playwright-dev/run.js new file mode 100755 index 0000000..10f2616 --- /dev/null +++ b/components/skills/frontend-testing-playwright-dev/run.js @@ -0,0 +1,228 @@ +#!/usr/bin/env node +/** + * Universal Playwright Executor for Claude Code + * + * Executes Playwright automation code from: + * - File path: node run.js script.js + * - Inline code: node run.js 'await page.goto("...")' + * - Stdin: cat script.js | node run.js + * + * Ensures proper module resolution by running from skill directory. + */ + +const fs = require('fs'); +const path = require('path'); +const { execSync } = require('child_process'); + +// Change to skill directory for proper module resolution +process.chdir(__dirname); + +/** + * Check if Playwright is installed + */ +function checkPlaywrightInstalled() { + try { + require.resolve('playwright'); + return true; + } catch (e) { + return false; + } +} + +/** + * Install Playwright if missing + */ +function installPlaywright() { + console.log('πŸ“¦ Playwright not found. 
Installing...'); + try { + execSync('npm install', { stdio: 'inherit', cwd: __dirname }); + execSync('npx playwright install chromium', { stdio: 'inherit', cwd: __dirname }); + console.log('βœ… Playwright installed successfully'); + return true; + } catch (e) { + console.error('❌ Failed to install Playwright:', e.message); + console.error('Please run manually: cd', __dirname, '&& npm run setup'); + return false; + } +} + +/** + * Get code to execute from various sources + */ +function getCodeToExecute() { + const args = process.argv.slice(2); + + // Case 1: File path provided + if (args.length > 0 && fs.existsSync(args[0])) { + const filePath = path.resolve(args[0]); + console.log(`πŸ“„ Executing file: ${filePath}`); + return fs.readFileSync(filePath, 'utf8'); + } + + // Case 2: Inline code provided as argument + if (args.length > 0) { + console.log('⚑ Executing inline code'); + return args.join(' '); + } + + // Case 3: Code from stdin + if (!process.stdin.isTTY) { + console.log('πŸ“₯ Reading from stdin'); + return fs.readFileSync(0, 'utf8'); + } + + // No input + console.error('❌ No code to execute'); + console.error('Usage:'); + console.error(' node run.js script.js # Execute file'); + console.error(' node run.js "code here" # Execute inline'); + console.error(' cat script.js | node run.js # Execute from stdin'); + process.exit(1); +} + +/** + * Clean up old temporary execution files from previous runs + */ +function cleanupOldTempFiles() { + try { + const files = fs.readdirSync(__dirname); + const tempFiles = files.filter(f => f.startsWith('.temp-execution-') && f.endsWith('.js')); + + if (tempFiles.length > 0) { + tempFiles.forEach(file => { + const filePath = path.join(__dirname, file); + try { + fs.unlinkSync(filePath); + } catch (e) { + // Ignore errors - file might be in use or already deleted + } + }); + } + } catch (e) { + // Ignore directory read errors + } +} + +/** + * Wrap code in async IIFE if not already wrapped + */ +function wrapCodeIfNeeded(code) { + // Check if code already has require() and async structure + const hasRequire = code.includes('require('); + const hasAsyncIIFE = code.includes('(async () => {') || code.includes('(async()=>{'); + + // If it's already a complete script, return as-is + if (hasRequire && hasAsyncIIFE) { + return code; + } + + // If it's just Playwright commands, wrap in full template + if (!hasRequire) { + return ` +const { chromium, firefox, webkit, devices } = require('playwright'); +const helpers = require('./lib/helpers'); + +// Extra headers from environment variables (if configured) +const __extraHeaders = helpers.getExtraHeadersFromEnv(); + +/** + * Utility to merge environment headers into context options. + * Use when creating contexts with raw Playwright API instead of helpers.createContext(). 
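+ * Values passed explicitly in options.extraHTTPHeaders take precedence over the env-derived headers.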
+ * @param {Object} options - Context options + * @returns {Object} Options with extraHTTPHeaders merged in + */ +function getContextOptionsWithHeaders(options = {}) { + if (!__extraHeaders) return options; + return { + ...options, + extraHTTPHeaders: { + ...__extraHeaders, + ...(options.extraHTTPHeaders || {}) + } + }; +} + +(async () => { + try { + ${code} + } catch (error) { + console.error('❌ Automation error:', error.message); + if (error.stack) { + console.error(error.stack); + } + process.exit(1); + } +})(); +`; + } + + // If has require but no async wrapper + if (!hasAsyncIIFE) { + return ` +(async () => { + try { + ${code} + } catch (error) { + console.error('❌ Automation error:', error.message); + if (error.stack) { + console.error(error.stack); + } + process.exit(1); + } +})(); +`; + } + + return code; +} + +/** + * Main execution + */ +async function main() { + console.log('🎭 Playwright Skill - Universal Executor\n'); + + // Clean up old temp files from previous runs + cleanupOldTempFiles(); + + // Check Playwright installation + if (!checkPlaywrightInstalled()) { + const installed = installPlaywright(); + if (!installed) { + process.exit(1); + } + } + + // Get code to execute + const rawCode = getCodeToExecute(); + const code = wrapCodeIfNeeded(rawCode); + + // Create temporary file for execution + const tempFile = path.join(__dirname, `.temp-execution-${Date.now()}.js`); + + try { + // Write code to temp file + fs.writeFileSync(tempFile, code, 'utf8'); + + // Execute the code + console.log('πŸš€ Starting automation...\n'); + require(tempFile); + + // Note: Temp file will be cleaned up on next run + // This allows long-running async operations to complete safely + + } catch (error) { + console.error('❌ Execution failed:', error.message); + if (error.stack) { + console.error('\nπŸ“‹ Stack trace:'); + console.error(error.stack); + } + process.exit(1); + } +} + +// Run main function +main().catch(error => { + console.error('❌ Fatal error:', error.message); + process.exit(1); +}); diff --git a/components/skills/k8s-helm-charts-dev/SKILL.md b/components/skills/k8s-helm-charts-dev/SKILL.md new file mode 100644 index 0000000..db31ab1 --- /dev/null +++ b/components/skills/k8s-helm-charts-dev/SKILL.md @@ -0,0 +1,544 @@ +--- +name: helm-chart-scaffolding +description: Design, organize, and manage Helm charts for templating and packaging Kubernetes applications with reusable configurations. Use when creating Helm charts, packaging Kubernetes applications, or implementing templated deployments. +--- + +# Helm Chart Scaffolding + +Comprehensive guidance for creating, organizing, and managing Helm charts for packaging and deploying Kubernetes applications. + +## Purpose + +This skill provides step-by-step instructions for building production-ready Helm charts, including chart structure, templating patterns, values management, and validation strategies. 
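+
+Before diving into the detailed steps, the sketch below shows a minimal end-to-end pass through the workflow (assuming Helm 3 is installed; the chart name `my-app` is purely illustrative):
+
+```bash
+# Scaffold a chart, lint it, render the manifests locally, then package it
+helm create my-app
+helm lint my-app/
+helm template my-app ./my-app
+helm package my-app/
+```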
+ +## When to Use This Skill + +Use this skill when you need to: +- Create new Helm charts from scratch +- Package Kubernetes applications for distribution +- Manage multi-environment deployments with Helm +- Implement templating for reusable Kubernetes manifests +- Set up Helm chart repositories +- Follow Helm best practices and conventions + +## Helm Overview + +**Helm** is the package manager for Kubernetes that: +- Templates Kubernetes manifests for reusability +- Manages application releases and rollbacks +- Handles dependencies between charts +- Provides version control for deployments +- Simplifies configuration management across environments + +## Step-by-Step Workflow + +### 1. Initialize Chart Structure + +**Create new chart:** +```bash +helm create my-app +``` + +**Standard chart structure:** +``` +my-app/ +β”œβ”€β”€ Chart.yaml # Chart metadata +β”œβ”€β”€ values.yaml # Default configuration values +β”œβ”€β”€ charts/ # Chart dependencies +β”œβ”€β”€ templates/ # Kubernetes manifest templates +β”‚ β”œβ”€β”€ NOTES.txt # Post-install notes +β”‚ β”œβ”€β”€ _helpers.tpl # Template helpers +β”‚ β”œβ”€β”€ deployment.yaml +β”‚ β”œβ”€β”€ service.yaml +β”‚ β”œβ”€β”€ ingress.yaml +β”‚ β”œβ”€β”€ serviceaccount.yaml +β”‚ β”œβ”€β”€ hpa.yaml +β”‚ └── tests/ +β”‚ └── test-connection.yaml +└── .helmignore # Files to ignore +``` + +### 2. Configure Chart.yaml + +**Chart metadata defines the package:** + +```yaml +apiVersion: v2 +name: my-app +description: A Helm chart for My Application +type: application +version: 1.0.0 # Chart version +appVersion: "2.1.0" # Application version + +# Keywords for chart discovery +keywords: + - web + - api + - backend + +# Maintainer information +maintainers: + - name: DevOps Team + email: devops@example.com + url: https://github.com/example/my-app + +# Source code repository +sources: + - https://github.com/example/my-app + +# Homepage +home: https://example.com + +# Chart icon +icon: https://example.com/icon.png + +# Dependencies +dependencies: + - name: postgresql + version: "12.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: postgresql.enabled + - name: redis + version: "17.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: redis.enabled +``` + +**Reference:** See `assets/Chart.yaml.template` for complete example + +### 3. Design values.yaml Structure + +**Organize values hierarchically:** + +```yaml +# Image configuration +image: + repository: myapp + tag: "1.0.0" + pullPolicy: IfNotPresent + +# Number of replicas +replicaCount: 3 + +# Service configuration +service: + type: ClusterIP + port: 80 + targetPort: 8080 + +# Ingress configuration +ingress: + enabled: false + className: nginx + hosts: + - host: app.example.com + paths: + - path: / + pathType: Prefix + +# Resources +resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + +# Autoscaling +autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + +# Environment variables +env: + - name: LOG_LEVEL + value: "info" + +# ConfigMap data +configMap: + data: + APP_MODE: production + +# Dependencies +postgresql: + enabled: true + auth: + database: myapp + username: myapp + +redis: + enabled: false +``` + +**Reference:** See `assets/values.yaml.template` for complete structure + +### 4. Create Template Files + +**Use Go templating with Helm functions:** + +**templates/deployment.yaml:** +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "my-app.fullname" . 
}} + labels: + {{- include "my-app.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "my-app.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "my-app.selectorLabels" . | nindent 8 }} + spec: + containers: + - name: {{ .Chart.Name }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.targetPort }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + env: + {{- toYaml .Values.env | nindent 12 }} +``` + +### 5. Create Template Helpers + +**templates/_helpers.tpl:** +```yaml +{{/* +Expand the name of the chart. +*/}} +{{- define "my-app.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +*/}} +{{- define "my-app.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "my-app.labels" -}} +helm.sh/chart: {{ include "my-app.chart" . }} +{{ include "my-app.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "my-app.selectorLabels" -}} +app.kubernetes.io/name: {{ include "my-app.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} +``` + +### 6. Manage Dependencies + +**Add dependencies in Chart.yaml:** +```yaml +dependencies: + - name: postgresql + version: "12.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: postgresql.enabled +``` + +**Update dependencies:** +```bash +helm dependency update +helm dependency build +``` + +**Override dependency values:** +```yaml +# values.yaml +postgresql: + enabled: true + auth: + database: myapp + username: myapp + password: changeme + primary: + persistence: + enabled: true + size: 10Gi +``` + +### 7. Test and Validate + +**Validation commands:** +```bash +# Lint the chart +helm lint my-app/ + +# Dry-run installation +helm install my-app ./my-app --dry-run --debug + +# Template rendering +helm template my-app ./my-app + +# Template with values +helm template my-app ./my-app -f values-prod.yaml + +# Show computed values +helm show values ./my-app +``` + +**Validation script:** +```bash +#!/bin/bash +set -e + +echo "Linting chart..." +helm lint . + +echo "Testing template rendering..." +helm template test-release . --dry-run + +echo "Checking for required values..." +helm template test-release . --validate + +echo "All validations passed!" +``` + +**Reference:** See `scripts/validate-chart.sh` + +### 8. Package and Distribute + +**Package the chart:** +```bash +helm package my-app/ +# Creates: my-app-1.0.0.tgz +``` + +**Create chart repository:** +```bash +# Create index +helm repo index . + +# Upload to repository +# AWS S3 example +aws s3 sync . 
s3://my-helm-charts/ --exclude "*" --include "*.tgz" --include "index.yaml" +``` + +**Use the chart:** +```bash +helm repo add my-repo https://charts.example.com +helm repo update +helm install my-app my-repo/my-app +``` + +### 9. Multi-Environment Configuration + +**Environment-specific values files:** + +``` +my-app/ +β”œβ”€β”€ values.yaml # Defaults +β”œβ”€β”€ values-dev.yaml # Development +β”œβ”€β”€ values-staging.yaml # Staging +└── values-prod.yaml # Production +``` + +**values-prod.yaml:** +```yaml +replicaCount: 5 + +image: + tag: "2.1.0" + +resources: + requests: + memory: "512Mi" + cpu: "500m" + limits: + memory: "1Gi" + cpu: "1000m" + +autoscaling: + enabled: true + minReplicas: 3 + maxReplicas: 20 + +ingress: + enabled: true + hosts: + - host: app.example.com + paths: + - path: / + pathType: Prefix + +postgresql: + enabled: true + primary: + persistence: + size: 100Gi +``` + +**Install with environment:** +```bash +helm install my-app ./my-app -f values-prod.yaml --namespace production +``` + +### 10. Implement Hooks and Tests + +**Pre-install hook:** +```yaml +# templates/pre-install-job.yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "my-app.fullname" . }}-db-setup + annotations: + "helm.sh/hook": pre-install + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded +spec: + template: + spec: + containers: + - name: db-setup + image: postgres:15 + command: ["psql", "-c", "CREATE DATABASE myapp"] + restartPolicy: Never +``` + +**Test connection:** +```yaml +# templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "my-app.fullname" . }}-test-connection" + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "my-app.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +``` + +**Run tests:** +```bash +helm test my-app +``` + +## Common Patterns + +### Pattern 1: Conditional Resources + +```yaml +{{- if .Values.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "my-app.fullname" . }} +spec: + # ... +{{- end }} +``` + +### Pattern 2: Iterating Over Lists + +```yaml +env: +{{- range .Values.env }} +- name: {{ .name }} + value: {{ .value | quote }} +{{- end }} +``` + +### Pattern 3: Including Files + +```yaml +data: + config.yaml: | + {{- .Files.Get "config/application.yaml" | nindent 4 }} +``` + +### Pattern 4: Global Values + +```yaml +global: + imageRegistry: docker.io + imagePullSecrets: + - name: regcred + +# Use in templates: +image: {{ .Values.global.imageRegistry }}/{{ .Values.image.repository }} +``` + +## Best Practices + +1. **Use semantic versioning** for chart and app versions +2. **Document all values** in values.yaml with comments +3. **Use template helpers** for repeated logic +4. **Validate charts** before packaging +5. **Pin dependency versions** explicitly +6. **Use conditions** for optional resources +7. **Follow naming conventions** (lowercase, hyphens) +8. **Include NOTES.txt** with usage instructions +9. **Add labels** consistently using helpers +10. 
**Test installations** in all environments + +## Troubleshooting + +**Template rendering errors:** +```bash +helm template my-app ./my-app --debug +``` + +**Dependency issues:** +```bash +helm dependency update +helm dependency list +``` + +**Installation failures:** +```bash +helm install my-app ./my-app --dry-run --debug +kubectl get events --sort-by='.lastTimestamp' +``` + +## Reference Files + +- `assets/Chart.yaml.template` - Chart metadata template +- `assets/values.yaml.template` - Values structure template +- `scripts/validate-chart.sh` - Validation script +- `references/chart-structure.md` - Detailed chart organization + +## Related Skills + +- `k8s-manifest-generator` - For creating base Kubernetes manifests +- `gitops-workflow` - For automated Helm chart deployments diff --git a/components/skills/k8s-helm-charts-dev/assets/Chart.yaml.template b/components/skills/k8s-helm-charts-dev/assets/Chart.yaml.template new file mode 100644 index 0000000..74dfe6e --- /dev/null +++ b/components/skills/k8s-helm-charts-dev/assets/Chart.yaml.template @@ -0,0 +1,42 @@ +apiVersion: v2 +name: +description: +type: application +version: 0.1.0 +appVersion: "1.0.0" + +keywords: + - + - + +home: https://github.com// + +sources: + - https://github.com// + +maintainers: + - name: + email: + url: https://github.com/ + +icon: https://example.com/icon.png + +kubeVersion: ">=1.24.0" + +dependencies: + - name: postgresql + version: "12.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: postgresql.enabled + tags: + - database + - name: redis + version: "17.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: redis.enabled + tags: + - cache + +annotations: + category: Application + licenses: Apache-2.0 diff --git a/components/skills/k8s-helm-charts-dev/assets/values.yaml.template b/components/skills/k8s-helm-charts-dev/assets/values.yaml.template new file mode 100644 index 0000000..117c1e5 --- /dev/null +++ b/components/skills/k8s-helm-charts-dev/assets/values.yaml.template @@ -0,0 +1,185 @@ +# Global values shared with subcharts +global: + imageRegistry: docker.io + imagePullSecrets: [] + storageClass: "" + +# Image configuration +image: + registry: docker.io + repository: myapp/web + tag: "" # Defaults to .Chart.AppVersion + pullPolicy: IfNotPresent + +# Override chart name +nameOverride: "" +fullnameOverride: "" + +# Number of replicas +replicaCount: 3 +revisionHistoryLimit: 10 + +# ServiceAccount +serviceAccount: + create: true + annotations: {} + name: "" + +# Pod annotations +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9090" + prometheus.io/path: "/metrics" + +# Pod security context +podSecurityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + +# Container security context +securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +# Service configuration +service: + type: ClusterIP + port: 80 + targetPort: http + annotations: {} + sessionAffinity: None + +# Ingress configuration +ingress: + enabled: false + className: nginx + annotations: {} + hosts: + - host: app.example.com + paths: + - path: / + pathType: Prefix + tls: [] + +# Resources +resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 250m + memory: 256Mi + +# Liveness probe +livenessProbe: + httpGet: + path: /health/live + port: http + initialDelaySeconds: 30 + periodSeconds: 10 + +# Readiness probe +readinessProbe: + httpGet: + path: 
/health/ready + port: http + initialDelaySeconds: 5 + periodSeconds: 5 + +# Autoscaling +autoscaling: + enabled: false + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + +# Pod Disruption Budget +podDisruptionBudget: + enabled: true + minAvailable: 1 + +# Node selection +nodeSelector: {} +tolerations: [] +affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - '{{ include "my-app.name" . }}' + topologyKey: kubernetes.io/hostname + +# Environment variables +env: [] +# - name: LOG_LEVEL +# value: "info" + +# ConfigMap data +configMap: + enabled: true + data: {} +# APP_MODE: production +# DATABASE_HOST: postgres.example.com + +# Secrets (use external secret management in production) +secrets: + enabled: false + data: {} + +# Persistent Volume +persistence: + enabled: false + storageClass: "" + accessMode: ReadWriteOnce + size: 10Gi + annotations: {} + +# PostgreSQL dependency +postgresql: + enabled: false + auth: + database: myapp + username: myapp + password: changeme + primary: + persistence: + enabled: true + size: 10Gi + +# Redis dependency +redis: + enabled: false + auth: + enabled: false + master: + persistence: + enabled: false + +# ServiceMonitor for Prometheus Operator +serviceMonitor: + enabled: false + interval: 30s + scrapeTimeout: 10s + labels: {} + +# Network Policy +networkPolicy: + enabled: false + policyTypes: + - Ingress + - Egress + ingress: [] + egress: [] diff --git a/components/skills/k8s-helm-charts-dev/references/chart-structure.md b/components/skills/k8s-helm-charts-dev/references/chart-structure.md new file mode 100644 index 0000000..2b8769a --- /dev/null +++ b/components/skills/k8s-helm-charts-dev/references/chart-structure.md @@ -0,0 +1,500 @@ +# Helm Chart Structure Reference + +Complete guide to Helm chart organization, file conventions, and best practices. 
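+
+As a quick sanity check against the layout described below, you can scaffold Helm's default chart and list what actually ends up in the packaged archive (a minimal sketch, assuming Helm 3 and a standard `tar`; `helm create` defaults the chart version to 0.1.0):
+
+```bash
+helm create my-app
+helm package my-app/        # .helmignore patterns are applied here
+tar -tzf my-app-0.1.0.tgz   # inspect exactly which files shipped in the chart
+```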
+ +## Standard Chart Directory Structure + +``` +my-app/ +β”œβ”€β”€ Chart.yaml # Chart metadata (required) +β”œβ”€β”€ Chart.lock # Dependency lock file (generated) +β”œβ”€β”€ values.yaml # Default configuration values (required) +β”œβ”€β”€ values.schema.json # JSON schema for values validation +β”œβ”€β”€ .helmignore # Patterns to ignore when packaging +β”œβ”€β”€ README.md # Chart documentation +β”œβ”€β”€ LICENSE # Chart license +β”œβ”€β”€ charts/ # Chart dependencies (bundled) +β”‚ └── postgresql-12.0.0.tgz +β”œβ”€β”€ crds/ # Custom Resource Definitions +β”‚ └── my-crd.yaml +β”œβ”€β”€ templates/ # Kubernetes manifest templates (required) +β”‚ β”œβ”€β”€ NOTES.txt # Post-install instructions +β”‚ β”œβ”€β”€ _helpers.tpl # Template helper functions +β”‚ β”œβ”€β”€ deployment.yaml +β”‚ β”œβ”€β”€ service.yaml +β”‚ β”œβ”€β”€ ingress.yaml +β”‚ β”œβ”€β”€ configmap.yaml +β”‚ β”œβ”€β”€ secret.yaml +β”‚ β”œβ”€β”€ serviceaccount.yaml +β”‚ β”œβ”€β”€ hpa.yaml +β”‚ β”œβ”€β”€ pdb.yaml +β”‚ β”œβ”€β”€ networkpolicy.yaml +β”‚ └── tests/ +β”‚ └── test-connection.yaml +└── files/ # Additional files to include + └── config/ + └── app.conf +``` + +## Chart.yaml Specification + +### API Version v2 (Helm 3+) + +```yaml +apiVersion: v2 # Required: API version +name: my-application # Required: Chart name +version: 1.2.3 # Required: Chart version (SemVer) +appVersion: "2.5.0" # Application version +description: A Helm chart for my application # Required +type: application # Chart type: application or library +keywords: # Search keywords + - web + - api + - backend +home: https://example.com # Project home page +sources: # Source code URLs + - https://github.com/example/my-app +maintainers: # Maintainer list + - name: John Doe + email: john@example.com + url: https://github.com/johndoe +icon: https://example.com/icon.png # Chart icon URL +kubeVersion: ">=1.24.0" # Compatible Kubernetes versions +deprecated: false # Mark chart as deprecated +annotations: # Arbitrary annotations + example.com/release-notes: https://example.com/releases/v1.2.3 +dependencies: # Chart dependencies + - name: postgresql + version: "12.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: postgresql.enabled + tags: + - database + import-values: + - child: database + parent: database + alias: db +``` + +## Chart Types + +### Application Chart +```yaml +type: application +``` +- Standard Kubernetes applications +- Can be installed and managed +- Contains templates for K8s resources + +### Library Chart +```yaml +type: library +``` +- Shared template helpers +- Cannot be installed directly +- Used as dependency by other charts +- No templates/ directory + +## Values Files Organization + +### values.yaml (defaults) +```yaml +# Global values (shared with subcharts) +global: + imageRegistry: docker.io + imagePullSecrets: [] + +# Image configuration +image: + registry: docker.io + repository: myapp/web + tag: "" # Defaults to .Chart.AppVersion + pullPolicy: IfNotPresent + +# Deployment settings +replicaCount: 1 +revisionHistoryLimit: 10 + +# Pod configuration +podAnnotations: {} +podSecurityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + +# Container security +securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + +# Service +service: + type: ClusterIP + port: 80 + targetPort: http + annotations: {} + +# Resources +resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + +# Autoscaling +autoscaling: + enabled: false + 
minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + +# Node selection +nodeSelector: {} +tolerations: [] +affinity: {} + +# Monitoring +serviceMonitor: + enabled: false + interval: 30s +``` + +### values.schema.json (validation) +```json +{ + "$schema": "https://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "replicaCount": { + "type": "integer", + "minimum": 1 + }, + "image": { + "type": "object", + "required": ["repository"], + "properties": { + "repository": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "pullPolicy": { + "type": "string", + "enum": ["Always", "IfNotPresent", "Never"] + } + } + } + }, + "required": ["image"] +} +``` + +## Template Files + +### Template Naming Conventions + +- **Lowercase with hyphens**: `deployment.yaml`, `service-account.yaml` +- **Partial templates**: Prefix with underscore `_helpers.tpl` +- **Tests**: Place in `templates/tests/` +- **CRDs**: Place in `crds/` (not templated) + +### Common Templates + +#### _helpers.tpl +```yaml +{{/* +Standard naming helpers +*/}} +{{- define "my-app.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "my-app.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "my-app.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "my-app.labels" -}} +helm.sh/chart: {{ include "my-app.chart" . }} +{{ include "my-app.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{- define "my-app.selectorLabels" -}} +app.kubernetes.io/name: {{ include "my-app.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Image name helper +*/}} +{{- define "my-app.image" -}} +{{- $registry := .Values.global.imageRegistry | default .Values.image.registry -}} +{{- $repository := .Values.image.repository -}} +{{- $tag := .Values.image.tag | default .Chart.AppVersion -}} +{{- printf "%s/%s:%s" $registry $repository $tag -}} +{{- end -}} +``` + +#### NOTES.txt +``` +Thank you for installing {{ .Chart.Name }}. + +Your release is named {{ .Release.Name }}. + +To learn more about the release, try: + + $ helm status {{ .Release.Name }} + $ helm get all {{ .Release.Name }} + +{{- if .Values.ingress.enabled }} + +Application URL: +{{- range .Values.ingress.hosts }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ .host }}{{ .path }} +{{- end }} +{{- else }} + +Get the application URL by running: + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "my-app.name" . 
}}" -o jsonpath="{.items[0].metadata.name}") + kubectl port-forward $POD_NAME 8080:80 + echo "Visit http://127.0.0.1:8080" +{{- end }} +``` + +## Dependencies Management + +### Declaring Dependencies + +```yaml +# Chart.yaml +dependencies: + - name: postgresql + version: "12.0.0" + repository: "https://charts.bitnami.com/bitnami" + condition: postgresql.enabled # Enable/disable via values + tags: # Group dependencies + - database + import-values: # Import values from subchart + - child: database + parent: database + alias: db # Reference as .Values.db +``` + +### Managing Dependencies + +```bash +# Update dependencies +helm dependency update + +# List dependencies +helm dependency list + +# Build dependencies +helm dependency build +``` + +### Chart.lock + +Generated automatically by `helm dependency update`: + +```yaml +dependencies: +- name: postgresql + repository: https://charts.bitnami.com/bitnami + version: 12.0.0 +digest: sha256:abcd1234... +generated: "2024-01-01T00:00:00Z" +``` + +## .helmignore + +Exclude files from chart package: + +``` +# Development files +.git/ +.gitignore +*.md +docs/ + +# Build artifacts +*.swp +*.bak +*.tmp +*.orig + +# CI/CD +.travis.yml +.gitlab-ci.yml +Jenkinsfile + +# Testing +test/ +*.test + +# IDE +.vscode/ +.idea/ +*.iml +``` + +## Custom Resource Definitions (CRDs) + +Place CRDs in `crds/` directory: + +``` +crds/ +β”œβ”€β”€ my-app-crd.yaml +└── another-crd.yaml +``` + +**Important CRD notes:** +- CRDs are installed before any templates +- CRDs are NOT templated (no `{{ }}` syntax) +- CRDs are NOT upgraded or deleted with chart +- Use `helm install --skip-crds` to skip installation + +## Chart Versioning + +### Semantic Versioning + +- **Chart Version**: Increment when chart changes + - MAJOR: Breaking changes + - MINOR: New features, backward compatible + - PATCH: Bug fixes + +- **App Version**: Application version being deployed + - Can be any string + - Not required to follow SemVer + +```yaml +version: 2.3.1 # Chart version +appVersion: "1.5.0" # Application version +``` + +## Chart Testing + +### Test Files + +```yaml +# templates/tests/test-connection.yaml +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "my-app.fullname" . }}-test-connection" + annotations: + "helm.sh/hook": test + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "my-app.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +``` + +### Running Tests + +```bash +helm test my-release +helm test my-release --logs +``` + +## Hooks + +Helm hooks allow intervention at specific points: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "my-app.fullname" . 
}}-migration + annotations: + "helm.sh/hook": pre-upgrade,pre-install + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded +``` + +### Hook Types + +- `pre-install`: Before templates rendered +- `post-install`: After all resources loaded +- `pre-delete`: Before any resources deleted +- `post-delete`: After all resources deleted +- `pre-upgrade`: Before upgrade +- `post-upgrade`: After upgrade +- `pre-rollback`: Before rollback +- `post-rollback`: After rollback +- `test`: Run with `helm test` + +### Hook Weight + +Controls hook execution order (-5 to 5, lower runs first) + +### Hook Deletion Policies + +- `before-hook-creation`: Delete previous hook before new one +- `hook-succeeded`: Delete after successful execution +- `hook-failed`: Delete if hook fails + +## Best Practices + +1. **Use helpers** for repeated template logic +2. **Quote strings** in templates: `{{ .Values.name | quote }}` +3. **Validate values** with values.schema.json +4. **Document all values** in values.yaml +5. **Use semantic versioning** for chart versions +6. **Pin dependency versions** exactly +7. **Include NOTES.txt** with usage instructions +8. **Add tests** for critical functionality +9. **Use hooks** for database migrations +10. **Keep charts focused** - one application per chart + +## Chart Repository Structure + +``` +helm-charts/ +β”œβ”€β”€ index.yaml +β”œβ”€β”€ my-app-1.0.0.tgz +β”œβ”€β”€ my-app-1.1.0.tgz +β”œβ”€β”€ my-app-1.2.0.tgz +└── another-chart-2.0.0.tgz +``` + +### Creating Repository Index + +```bash +helm repo index . --url https://charts.example.com +``` + +## Related Resources + +- [Helm Documentation](https://helm.sh/docs/) +- [Chart Template Guide](https://helm.sh/docs/chart_template_guide/) +- [Best Practices](https://helm.sh/docs/chart_best_practices/) diff --git a/components/skills/k8s-helm-charts-dev/scripts/validate-chart.sh b/components/skills/k8s-helm-charts-dev/scripts/validate-chart.sh new file mode 100755 index 0000000..b8d5b0f --- /dev/null +++ b/components/skills/k8s-helm-charts-dev/scripts/validate-chart.sh @@ -0,0 +1,244 @@ +#!/bin/bash +set -e + +CHART_DIR="${1:-.}" +RELEASE_NAME="test-release" + +echo "═══════════════════════════════════════════════════════" +echo " Helm Chart Validation" +echo "═══════════════════════════════════════════════════════" +echo "" + +# Colors +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +RED='\033[0;31m' +NC='\033[0m' # No Color + +success() { + echo -e "${GREEN}βœ“${NC} $1" +} + +warning() { + echo -e "${YELLOW}⚠${NC} $1" +} + +error() { + echo -e "${RED}βœ—${NC} $1" +} + +# Check if Helm is installed +if ! command -v helm &> /dev/null; then + error "Helm is not installed" + exit 1 +fi + +echo "πŸ“¦ Chart directory: $CHART_DIR" +echo "" + +# 1. Check chart structure +echo "1️⃣ Checking chart structure..." +if [ ! -f "$CHART_DIR/Chart.yaml" ]; then + error "Chart.yaml not found" + exit 1 +fi +success "Chart.yaml exists" + +if [ ! -f "$CHART_DIR/values.yaml" ]; then + error "values.yaml not found" + exit 1 +fi +success "values.yaml exists" + +if [ ! -d "$CHART_DIR/templates" ]; then + error "templates/ directory not found" + exit 1 +fi +success "templates/ directory exists" +echo "" + +# 2. Lint the chart +echo "2️⃣ Linting chart..." +if helm lint "$CHART_DIR"; then + success "Chart passed lint" +else + error "Chart failed lint" + exit 1 +fi +echo "" + +# 3. Check Chart.yaml +echo "3️⃣ Validating Chart.yaml..." 
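+# Chart metadata is parsed with grep/awk below, which assumes simple single-line name/version/appVersion fields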
+CHART_NAME=$(grep "^name:" "$CHART_DIR/Chart.yaml" | awk '{print $2}') +CHART_VERSION=$(grep "^version:" "$CHART_DIR/Chart.yaml" | awk '{print $2}') +APP_VERSION=$(grep "^appVersion:" "$CHART_DIR/Chart.yaml" | awk '{print $2}' | tr -d '"') + +if [ -z "$CHART_NAME" ]; then + error "Chart name not found" + exit 1 +fi +success "Chart name: $CHART_NAME" + +if [ -z "$CHART_VERSION" ]; then + error "Chart version not found" + exit 1 +fi +success "Chart version: $CHART_VERSION" + +if [ -z "$APP_VERSION" ]; then + warning "App version not specified" +else + success "App version: $APP_VERSION" +fi +echo "" + +# 4. Test template rendering +echo "4️⃣ Testing template rendering..." +if helm template "$RELEASE_NAME" "$CHART_DIR" > /dev/null 2>&1; then + success "Templates rendered successfully" +else + error "Template rendering failed" + helm template "$RELEASE_NAME" "$CHART_DIR" + exit 1 +fi +echo "" + +# 5. Dry-run installation +echo "5️⃣ Testing dry-run installation..." +if helm install "$RELEASE_NAME" "$CHART_DIR" --dry-run --debug > /dev/null 2>&1; then + success "Dry-run installation successful" +else + error "Dry-run installation failed" + exit 1 +fi +echo "" + +# 6. Check for required Kubernetes resources +echo "6️⃣ Checking generated resources..." +MANIFESTS=$(helm template "$RELEASE_NAME" "$CHART_DIR") + +if echo "$MANIFESTS" | grep -q "kind: Deployment"; then + success "Deployment found" +else + warning "No Deployment found" +fi + +if echo "$MANIFESTS" | grep -q "kind: Service"; then + success "Service found" +else + warning "No Service found" +fi + +if echo "$MANIFESTS" | grep -q "kind: ServiceAccount"; then + success "ServiceAccount found" +else + warning "No ServiceAccount found" +fi +echo "" + +# 7. Check for security best practices +echo "7️⃣ Checking security best practices..." +if echo "$MANIFESTS" | grep -q "runAsNonRoot: true"; then + success "Running as non-root user" +else + warning "Not explicitly running as non-root" +fi + +if echo "$MANIFESTS" | grep -q "readOnlyRootFilesystem: true"; then + success "Using read-only root filesystem" +else + warning "Not using read-only root filesystem" +fi + +if echo "$MANIFESTS" | grep -q "allowPrivilegeEscalation: false"; then + success "Privilege escalation disabled" +else + warning "Privilege escalation not explicitly disabled" +fi +echo "" + +# 8. Check for resource limits +echo "8️⃣ Checking resource configuration..." +if echo "$MANIFESTS" | grep -q "resources:"; then + if echo "$MANIFESTS" | grep -q "limits:"; then + success "Resource limits defined" + else + warning "No resource limits defined" + fi + if echo "$MANIFESTS" | grep -q "requests:"; then + success "Resource requests defined" + else + warning "No resource requests defined" + fi +else + warning "No resources defined" +fi +echo "" + +# 9. Check for health probes +echo "9️⃣ Checking health probes..." +if echo "$MANIFESTS" | grep -q "livenessProbe:"; then + success "Liveness probe configured" +else + warning "No liveness probe found" +fi + +if echo "$MANIFESTS" | grep -q "readinessProbe:"; then + success "Readiness probe configured" +else + warning "No readiness probe found" +fi +echo "" + +# 10. Check dependencies +if [ -f "$CHART_DIR/Chart.yaml" ] && grep -q "^dependencies:" "$CHART_DIR/Chart.yaml"; then + echo "πŸ”Ÿ Checking dependencies..." 
+ if helm dependency list "$CHART_DIR" > /dev/null 2>&1; then + success "Dependencies valid" + + if [ -f "$CHART_DIR/Chart.lock" ]; then + success "Chart.lock file present" + else + warning "Chart.lock file missing (run 'helm dependency update')" + fi + else + error "Dependencies check failed" + fi + echo "" +fi + +# 11. Check for values schema +if [ -f "$CHART_DIR/values.schema.json" ]; then + echo "1️⃣1️⃣ Validating values schema..." + success "values.schema.json present" + + # Validate schema if jq is available + if command -v jq &> /dev/null; then + if jq empty "$CHART_DIR/values.schema.json" 2>/dev/null; then + success "values.schema.json is valid JSON" + else + error "values.schema.json contains invalid JSON" + exit 1 + fi + fi + echo "" +fi + +# Summary +echo "═══════════════════════════════════════════════════════" +echo " Validation Complete!" +echo "═══════════════════════════════════════════════════════" +echo "" +echo "Chart: $CHART_NAME" +echo "Version: $CHART_VERSION" +if [ -n "$APP_VERSION" ]; then + echo "App Version: $APP_VERSION" +fi +echo "" +success "All validations passed!" +echo "" +echo "Next steps:" +echo " β€’ helm package $CHART_DIR" +echo " β€’ helm install my-release $CHART_DIR" +echo " β€’ helm test my-release" +echo "" diff --git a/components/skills/lang-elixir-patterns-eng/SKILL.md b/components/skills/lang-elixir-patterns-eng/SKILL.md new file mode 100644 index 0000000..a48dcf1 --- /dev/null +++ b/components/skills/lang-elixir-patterns-eng/SKILL.md @@ -0,0 +1,1017 @@ +--- +name: elixir-architect +description: Use when designing or architecting Elixir/Phoenix applications, creating comprehensive project documentation, planning OTP supervision trees, defining domain models with Ash Framework, structuring multi-app projects with path-based dependencies, or preparing handoff documentation for Director/Implementor AI collaboration +--- + +# Elixir Project Architect + +You are an expert Elixir/OTP system architect specializing in creating production-ready systems with comprehensive documentation. You create complete documentation packages that enable Director and Implementor AI agents to successfully build complex systems following best practices from Dave Thomas, SaΕ‘a JuriΔ‡, and the Elixir community. + +## Core Principles + +1. **Database as Source of Truth** - No GenServers for domain entities +2. **Functional Core, Imperative Shell** - Pure business logic in impl/ layer +3. **Let It Crash** - Supervision trees for fault tolerance +4. **Dave Thomas Structure** - Path-based dependencies, not umbrella apps +5. **Ash Framework First** - Declarative domain modeling with auto-generated APIs +6. **Oban for Async** - Never block request path with external calls +7. 
**Test-Driven Development** - Write tests first, always + +## When to Use This Skill + +Invoke this skill when you need to: + +- Design a new Elixir/Phoenix application from scratch +- Create comprehensive architecture documentation +- Plan OTP supervision trees and process architecture +- Define domain models with Ash Framework resources +- Structure multi-app projects (Dave Thomas style) +- Create Architecture Decision Records (ADRs) +- Prepare handoff documentation for AI agent collaboration +- Set up guardrails for Director/Implementor AI workflows +- Design financial systems, e-commerce platforms, or SaaS applications +- Plan background job processing with Oban +- Structure event-driven systems with GenStage/Broadway + +## Your Process + +### Phase 1: Gather Requirements + +Ask the user these essential questions: + +1. **Project Domain**: What is the system for? (e.g., task management, e-commerce, SaaS, messaging platform) +2. **Tech Stack**: Confirm Elixir + OTP + Ash + Oban + Phoenix + LiveView? +3. **Project Location**: Where should files be created? (provide absolute path) +4. **Structure Style**: Dave Thomas path-based dependencies or umbrella app? +5. **Special Requirements**: + - Multi-tenancy needed? + - Event sourcing or CQRS? + - External integrations (payment processors, APIs)? + - Real-time features (WebSockets, LiveView)? + - Background processing needs? +6. **Scale Targets**: Expected load, users, transactions per second? +7. **AI Collaboration**: Will Director and Implementor AIs be used? + +### Phase 2: Expert Consultation + +Launch parallel Task agents to research: + +1. **Domain Patterns** - Research similar systems and proven architectures +2. **Framework Best Practices** - Ash Framework, Oban, Phoenix patterns +3. **Book Knowledge** - Extract wisdom from available Elixir books +4. **Structure Analysis** - Study Dave Thomas's multi-app approach +5. 
**Superpowers Framework** - If handoff docs needed, research task breakdown format + +Example Task invocations: +``` +Task 1: Research [domain] architecture patterns and data models +Task 2: Analyze Ash Framework resource patterns, extensions, and best practices +Task 3: Study Dave Thomas's path-based dependency approach from available projects +Task 4: Research Superpowers framework for implementation plan format +``` + +### Phase 3: Create Directory Structure + +Create this structure at the user-specified location: + +``` +project_root/ +β”œβ”€β”€ README.md +β”œβ”€β”€ CLAUDE.md +β”œβ”€β”€ docs/ +β”‚ β”œβ”€β”€ HANDOFF.md +β”‚ β”œβ”€β”€ architecture/ +β”‚ β”‚ β”œβ”€β”€ 00_SYSTEM_OVERVIEW.md +β”‚ β”‚ β”œβ”€β”€ 01_DOMAIN_MODEL.md +β”‚ β”‚ β”œβ”€β”€ 02_DATA_LAYER.md +β”‚ β”‚ β”œβ”€β”€ 03_FUNCTIONAL_CORE.md +β”‚ β”‚ β”œβ”€β”€ 04_BOUNDARIES.md +β”‚ β”‚ β”œβ”€β”€ 05_LIFECYCLE.md +β”‚ β”‚ β”œβ”€β”€ 06_WORKERS.md +β”‚ β”‚ └── 07_INTEGRATION_PATTERNS.md +β”‚ β”œβ”€β”€ design/ # Empty - Director AI fills during feature work +β”‚ β”œβ”€β”€ plans/ # Empty - Director AI creates Superpowers plans +β”‚ β”œβ”€β”€ api/ # Empty - Director AI documents API contracts +β”‚ β”œβ”€β”€ decisions/ # ADRs +β”‚ β”‚ β”œβ”€β”€ ADR-001-framework-choice.md +β”‚ β”‚ β”œβ”€β”€ ADR-002-id-strategy.md +β”‚ β”‚ β”œβ”€β”€ ADR-003-process-architecture.md +β”‚ β”‚ └── [domain-specific ADRs] +β”‚ └── guardrails/ +β”‚ β”œβ”€β”€ NEVER_DO.md +β”‚ β”œβ”€β”€ ALWAYS_DO.md +β”‚ β”œβ”€β”€ DIRECTOR_ROLE.md +β”‚ β”œβ”€β”€ IMPLEMENTOR_ROLE.md +β”‚ └── CODE_REVIEW_CHECKLIST.md +``` + +### Phase 4: Foundation Documentation + +#### README.md Structure + +```markdown +# [Project Name] + +[One-line description] + +## Overview +[2-3 paragraphs: what this system does and why] + +## Architecture +This project follows Dave Thomas's multi-app structure: + +project_root/ +β”œβ”€β”€ [app_name]_core/ # Domain logic (Ash resources, pure functions) +β”œβ”€β”€ [app_name]_api/ # REST/GraphQL APIs (Phoenix) +β”œβ”€β”€ [app_name]_jobs/ # Background jobs (Oban workers) +β”œβ”€β”€ [app_name]_events/ # Event streaming (Broadway) +└── [app_name]_admin/ # Admin UI (LiveView) + +## Tech Stack +- **Elixir** 1.17+ with OTP 27+ +- **Ash Framework** 3.0+ - Declarative domain modeling +- **Oban** 2.17+ - Background job processing +- **Phoenix** 1.7+ - Web framework +- **PostgreSQL** 16+ - Primary database + +## Getting Started +[Setup instructions] + +## Development +[Common tasks, testing, etc.] + +## Documentation +See `docs/` directory for comprehensive architecture documentation. +``` + +#### CLAUDE.md - Critical AI Context + +Must include these sections with concrete examples: + +1. **Project Context** - System purpose and domain +2. **Hybrid Design Philosophy** - Pattern sources +3. **Key Architectural Decisions** - With trade-offs +4. **Database as Source of Truth** - Why no GenServers for entities +5. **Code Conventions** - Naming, structure, organization +6. **Money Handling** - Never floats! Use integers (cents) or Decimal +7. **Testing Patterns** - Unit/Integration/Property tests +8. **AI Agent Roles** - Director vs Implementor boundaries +9. **Common Mistakes** - Anti-patterns with corrections + +Example money handling section: +```elixir +# ❌ NEVER +attribute :amount, :float + +# βœ… ALWAYS +attribute :amount, :integer # Store cents: 100_00 = $100.00 +attribute :balance, :decimal # Or use Decimal for precision + +# Why: 0.1 + 0.2 != 0.3 in floating point! +``` + +### Phase 5: Guardrails Documentation + +Create 5 critical files: + +#### 1. 
NEVER_DO.md (10 Prohibitions) + +Template structure: +```markdown +# NEVER DO: Critical Prohibitions + +## 1. Never Use Floats for Money +❌ **NEVER**: `attribute :amount, :float` +βœ… **ALWAYS**: `attribute :amount, :integer` or `attribute :balance, :decimal` +**Why**: Float precision errors cause incorrect financial calculations + +## 2. Never Update Balance Without Version Check +❌ **NEVER**: Direct update without optimistic locking +βœ… **ALWAYS**: Check version field for concurrent updates +**Why**: Prevents lost updates in concurrent scenarios + +[... 8 more critical prohibitions with code examples ...] +``` + +Include prohibitions for: +- Float usage for money +- Missing version checks (optimistic locking) +- GenServers for domain entities +- Partial transaction commits +- Skipping double-entry validation (if financial) +- Synchronous external API calls in request path +- Storing financial state in process memory +- Mutable data structures +- Logging sensitive data +- Direct user input in queries (SQL injection) + +#### 2. ALWAYS_DO.md (22 Mandatory Practices) + +Categories: +- **Data Integrity**: Transactions, events, ULIDs, audit trail +- **Testing**: TDD, edge cases, concurrent scenarios, property tests +- **Code Quality**: Typespecs, documentation, commits, DRY, YAGNI +- **Architecture**: Separation of concerns, Ash Actions, Oban, GenStage + +Example: +```elixir +# βœ… ALWAYS wrap multi-step operations in transactions +Multi.new() +|> Multi.insert(:transaction, transaction_changeset) +|> Multi.run(:operations, fn _repo, %{transaction: txn} -> + create_operations(txn.id, params) +end) +|> Multi.run(:update_balances, fn _repo, %{operations: ops} -> + update_balances(ops) +end) +|> Repo.transaction() +``` + +#### 3. DIRECTOR_ROLE.md + +Define Director AI responsibilities: +- Architecture decisions +- Design documentation +- Implementation planning (Superpowers format) +- Code review against design +- Maintaining consistency + +Include: +- What Director CAN do (document, design, plan, review) +- What Director CANNOT do (implement, code, execute) +- Decision authority matrix +- Communication protocol with templates +- Quality gates + +#### 4. IMPLEMENTOR_ROLE.md + +Define Implementor AI responsibilities: +- Execute implementation plans +- Write tests first (TDD) +- Maintain code quality +- Report progress/blockers + +Include: +- What Implementor CAN do (code, test, tactical decisions) +- What Implementor CANNOT do (architecture, design changes) +- When to stop and ask Director +- TDD workflow with examples +- Code quality checklist + +#### 5. 
CODE_REVIEW_CHECKLIST.md + +Comprehensive checklist covering: +- Correctness (logic, error handling) +- Financial Integrity (if applicable: double-entry, balances, audit trail) +- Data Integrity (transactions, optimistic locking, constraints) +- Security (input validation, secrets, SQL injection) +- Testing (coverage, edge cases, property tests) +- Code Quality (typespecs, docs, formatting, Credo) +- Documentation (moduledocs, function docs, examples) +- Performance (N+1 queries, indexes, caching) +- Architecture (layering, separation, patterns) + +### Phase 6: Architecture Documentation (8 Files) + +#### 00_SYSTEM_OVERVIEW.md +- Vision and goals +- High-level architecture diagram (ASCII art is fine) +- Component overview (apps and their purposes) +- Data flow diagrams +- Technology justification (why Ash, why Oban, why PostgreSQL) +- Scalability strategy (read replicas, caching, partitioning) +- Security approach (authentication, authorization, secrets) +- Performance targets with specific metrics + +#### 01_DOMAIN_MODEL.md +- All domain entities with complete field definitions +- Relationships between entities (has_many, belongs_to) +- Business rules and constraints +- State machines (if applicable, with ASCII diagrams) +- Use cases with concrete code examples +- Entity lifecycle explanations + +Example entity: +```elixir +%Task{ + id: "tsk_01HQBMB5KTQNDRPQHM3VXDT2E9K", # ULID with prefix + project_id: "prj_01HQBMA5KTQNDRPQHM3VXDT2E9K", + title: "Implement user authentication", + description: "Add JWT-based auth with refresh tokens", + status: :in_progress, # :todo | :in_progress | :blocked | :review | :done + priority: :high, # :low | :medium | :high | :urgent + assignee_id: "usr_01HQBMB5KTQNDRPQHM3VXDT2E9K", + due_date: ~D[2024-02-01], + estimated_hours: 8, + version: 1, + inserted_at: ~U[2024-01-01 00:00:00Z], + updated_at: ~U[2024-01-01 00:00:00Z] +} +``` + +#### 02_DATA_LAYER.md +- Complete Ash Resource definitions for all entities +- PostgreSQL table schemas +- Indexes and their justifications +- Optimistic locking implementation (version fields) +- Performance considerations +- Migration strategy + +Example Ash Resource: +```elixir +defmodule TaskManager.Task do + use Ash.Resource, + domain: TaskManager, + data_layer: AshPostgres.DataLayer, + extensions: [AshPaperTrail] + + postgres do + table "tasks" + repo TaskManager.Repo + end + + attributes do + uuid_v7_primary_key :id, prefix: "tsk" + attribute :title, :string, allow_nil?: false + attribute :description, :string + attribute :status, :atom, + constraints: [one_of: [:todo, :in_progress, :blocked, :review, :done]], + default: :todo + attribute :priority, :atom, + constraints: [one_of: [:low, :medium, :high, :urgent]], + default: :medium + attribute :due_date, :date + attribute :estimated_hours, :integer + attribute :version, :integer, default: 1 + timestamps() + end + + relationships do + belongs_to :project, TaskManager.Project + belongs_to :assignee, TaskManager.User + has_many :comments, TaskManager.Comment + end + + actions do + defaults [:read, :destroy] + + create :create do + accept [:title, :description, :status, :priority, :project_id, :assignee_id] + change fn changeset, _ -> + Ash.Changeset.force_change_attribute(changeset, :status, :todo) + end + end + + update :update_with_version do + accept [:title, :description, :status, :priority, :assignee_id, :due_date] + require_atomic? 
false + change optimistic_lock(:version) + end + + update :assign do + accept [:assignee_id] + change optimistic_lock(:version) + end + + update :transition_status do + accept [:status] + validate fn changeset, _ -> + # Validate state machine transitions + validate_status_transition(changeset) + end + change optimistic_lock(:version) + end + end +end +``` + +#### 03_FUNCTIONAL_CORE.md +- Pure business logic patterns (no side effects) +- Core calculations (priorities, estimates, metrics) +- Validation logic (state transitions, constraints) +- Testing patterns for pure functions +- Property test examples + +Example: +```elixir +defmodule TaskManager.Impl.TaskLogic do + @moduledoc """ + Pure functions for task business logic. + No database access, no side effects. + """ + + @spec can_transition?(atom(), atom()) :: boolean() + def can_transition?(from_status, to_status) do + valid_transitions = %{ + todo: [:in_progress, :blocked], + in_progress: [:blocked, :review, :done], + blocked: [:todo, :in_progress], + review: [:in_progress, :done], + done: [] + } + + to_status in Map.get(valid_transitions, from_status, []) + end + + @spec calculate_priority_score(map()) :: integer() + def calculate_priority_score(task) do + base_score = priority_value(task.priority) + urgency_bonus = days_until_due(task.due_date) + dependency_factor = if task.has_blockers?, do: -10, else: 0 + + base_score + urgency_bonus + dependency_factor + end + + defp priority_value(:urgent), do: 100 + defp priority_value(:high), do: 75 + defp priority_value(:medium), do: 50 + defp priority_value(:low), do: 25 + + defp days_until_due(nil), do: 0 + defp days_until_due(due_date) do + diff = Date.diff(due_date, Date.utc_today()) + cond do + diff < 0 -> 50 # Overdue + diff <= 3 -> 30 # Within 3 days + diff <= 7 -> 15 # Within a week + true -> 0 + end + end +end +``` + +#### 04_BOUNDARIES.md +- Service orchestration layer +- Ecto.Multi patterns for atomic operations +- Transaction boundaries +- Error handling strategies +- Service composition patterns + +Example: +```elixir +defmodule TaskManager.Boundaries.TaskService do + alias Ecto.Multi + alias TaskManager.Impl.TaskLogic + + def transition_task(task_id, new_status, opts \\ []) do + Multi.new() + |> Multi.run(:load_task, fn _repo, _changes -> + case Ash.get(Task, task_id) do + {:ok, task} -> {:ok, task} + error -> error + end + end) + |> Multi.run(:validate_transition, fn _repo, %{load_task: task} -> + # Pure validation from impl/ layer + if TaskLogic.can_transition?(task.status, new_status) do + {:ok, :valid} + else + {:error, :invalid_transition} + end + end) + |> Multi.run(:update_task, fn _repo, %{load_task: task} -> + Task.transition_status(task, %{status: new_status}) + end) + |> Multi.run(:create_activity, fn _repo, %{update_task: task} -> + create_activity_log(task, "status_changed", %{from: task.status, to: new_status}) + end) + |> Multi.run(:notify_assignee, fn _repo, %{update_task: task} -> + if opts[:notify], do: send_notification(task.assignee_id, task) + {:ok, :notified} + end) + |> Multi.run(:publish_event, fn _repo, %{update_task: task} -> + publish_task_updated(task) + end) + |> Repo.transaction() + end +end +``` + +#### 05_LIFECYCLE.md +- OTP application structure +- Supervision tree diagrams +- GenServer usage (infrastructure only, NOT entities!) 
+- GenStage/Flow pipelines +- Telemetry setup +- Health checks + +Example supervisor: +```elixir +def start(_type, _args) do + children = [ + {TaskManager.Repo, []}, + {Phoenix.PubSub, name: TaskManager.PubSub}, + {TaskManager.Runtime.TaskCache, []}, + genstage_supervisor_spec(), + {Oban, Application.fetch_env!(:task_manager, Oban)} + ] + + opts = [strategy: :one_for_one, name: TaskManager.Supervisor] + Supervisor.start_link(children, opts) +end +``` + +#### 06_WORKERS.md +- Oban worker definitions +- Job queues and priorities +- Retry strategies +- Worker testing patterns +- Background job best practices + +Example: +```elixir +defmodule TaskManager.Workers.ReminderNotifier do + use Oban.Worker, + queue: :notifications, + max_attempts: 3, + priority: 2 + + @impl Oban.Worker + def perform(%Oban.Job{args: %{"task_id" => id, "type" => type}}) do + with {:ok, task} <- get_task(id), + {:ok, assignee} <- get_assignee(task.assignee_id), + :ok <- send_reminder(assignee, task, type) do + {:ok, :notified} + end + end + + defp send_reminder(assignee, task, "due_soon") do + # Send email/push notification + # Task is due within 24 hours + Notifications.send(assignee.email, "Task Due Soon", render_template(task)) + end + + defp send_reminder(assignee, task, "overdue") do + # Task is past due date + Notifications.send_urgent(assignee.email, "Overdue Task", render_template(task)) + end +end +``` + +#### 07_INTEGRATION_PATTERNS.md +- HTTP client patterns with Finch +- Circuit breaker implementation +- Retry logic with exponential backoff +- Webhook handling (incoming and outgoing) +- Event streaming with Broadway +- External service integration patterns + +Example: +```elixir +defmodule TaskManager.Integration.HTTPClient do + def request(method, url, body, opts \\ []) do + timeout = Keyword.get(opts, :timeout, 5_000) + retries = Keyword.get(opts, :retries, 3) + + request = build_request(method, url, body) + do_request_with_retry(request, timeout, retries) + end + + defp do_request_with_retry(request, timeout, retries_left, attempt \\ 1) do + case Finch.request(request, TaskManager.Finch, receive_timeout: timeout) do + {:ok, %{status: status}} when status in 200..299 -> + {:ok, decode_response(response)} + + {:ok, %{status: status}} when status in 500..599 and retries_left > 0 -> + backoff = calculate_backoff(attempt) + Process.sleep(backoff) + do_request_with_retry(request, timeout, retries_left - 1, attempt + 1) + + {:error, _} = error -> + error + end + end + + defp calculate_backoff(attempt) do + # Exponential backoff: 100ms, 200ms, 400ms, 800ms + trunc(:math.pow(2, attempt - 1) * 100) + end +end +``` + +### Phase 7: Architecture Decision Records + +Create ADRs for major decisions. Template: + +```markdown +# ADR-XXX: [Decision Title] + +**Status:** Accepted +**Date:** YYYY-MM-DD +**Deciders:** [Role] +**Context:** [Brief context] + +## Context +[Detailed explanation of the situation requiring a decision] + +## Decision +[Clear statement of what was decided] + +## Rationale +[Why this decision was made - include code examples, metrics, trade-offs] + +## Alternatives Considered + +### Alternative 1: [Name] +**Implementation:** +```elixir +# Example code +``` + +**Pros:** +- Advantage 1 +- Advantage 2 + +**Cons:** +- Disadvantage 1 +- Disadvantage 2 + +**Why Rejected:** [Clear explanation] + +### Alternative 2: [Name] +[Same structure] + +## Consequences + +### Positive +1. Benefit with explanation +2. Another benefit + +### Negative +1. Trade-off with mitigation strategy +2. 
Another trade-off + +## Implementation Guidelines + +### DO: [Pattern] +```elixir +# Good example +``` + +### DON'T: [Anti-pattern] +```elixir +# Bad example +``` + +## Validation +[How we'll verify this was the right choice] +- Metric 1: Target value +- Metric 2: Target value + +## References +- [Link 1] +- [Link 2] + +## Related ADRs +- ADR-XXX: Related Decision + +## Review Schedule +**Last Reviewed:** YYYY-MM-DD +**Next Review:** YYYY-MM-DD +``` + +**Minimum ADRs to create:** + +1. **ADR-001: Framework Choice** (Ash vs Plain Ecto vs Event Sourcing) +2. **ADR-002: ID Strategy** (ULID vs UUID vs Auto-increment vs Snowflake) +3. **ADR-003: Process Architecture** (Database as source of truth vs GenServers for entities) +4. **Domain-specific ADRs** based on requirements + +### Phase 8: Handoff Documentation + +Create HANDOFF.md with: + +1. **Overview** - Project status, location, ready state +2. **Project Structure** - Annotated directory tree +3. **Documentation Index** - What each file contains +4. **Workflow** - Director β†’ Implementor β†’ Review β†’ Iterate cycle +5. **Implementation Phases** - Break project into 4-week phases +6. **Key Architectural Principles** - DO/DON'T examples +7. **Testing Strategy** - Unit/Integration/Property test patterns +8. **Commit Message Format** - Conventional commits structure +9. **Communication Protocol** - Message templates between Director/Implementor +10. **Troubleshooting** - Common issues and solutions +11. **Success Metrics** - Specific performance targets +12. **Next Steps** - Immediate actions for Director AI + +Example workflow section: +```markdown +## Workflow + +### Phase 1: Director Creates Design & Plan +1. Read feature request from user +2. Review architecture documents +3. Create design document in `docs/design/` +4. Create implementation plan in `docs/plans/` (Superpowers format) +5. Commit design + plan +6. Hand off to Implementor with plan path + +### Phase 2: Implementor Executes Plan +1. Read implementation plan +2. For each task: + - Write test first (TDD) + - Implement minimum code + - Refactor + - Run tests + - Commit +3. Report completion to Director + +### Phase 3: Director Reviews +1. Review committed code +2. Check against design +3. Verify guardrails followed +4. Either approve or request changes + +### Phase 4: Iterate Until Approved +[Loop until feature is complete] +``` + +### Phase 9: Validate and Summarize + +Before finishing, verify: + +1. βœ… All directories created +2. βœ… 20+ documentation files present +3. βœ… All cross-references between docs work +4. βœ… All code examples are valid Elixir syntax +5. βœ… Every architectural principle has concrete example +6. βœ… ADRs include alternatives with rationale +7. βœ… Guardrails have DO/DON'T code examples +8. βœ… Domain-specific adaptations included + +Present summary: +```markdown +## Project Architecture Complete! πŸš€ + +**Location:** /path/to/project + +**Created:** +- βœ… Complete directory structure +- βœ… Foundation docs (README, CLAUDE.md) +- βœ… 5 guardrail documents +- βœ… 8 architecture documents (~6,000 lines) +- βœ… X Architecture Decision Records +- βœ… Handoff documentation + +**Ready For:** +- Director AI to create first design + plan +- Implementor AI to execute implementation +- Iterative feature development + +**Next Step:** +Director AI should begin by creating the first feature design. +``` + +## Domain-Specific Adaptations + +### For Task Management Systems + +Add emphasis on: + +1. 
**NEVER_DO.md** additions: + - Never allow invalid status transitions (enforce state machine) + - Never skip concurrency checks (optimistic locking for updates) + - Never store assignment history only in memory (persist for audit) + +2. **Domain Model** inclusions: + - Task state machine (todo β†’ in_progress β†’ review β†’ done) + - Priority calculation algorithms + - Dependency management (blocked tasks, prerequisites) + - Assignment and notification workflows + - Activity log for audit trail + +3. **ADRs** to add: + - State machine implementation (database constraints vs application logic) + - Priority scoring algorithm + - Real-time update strategy (PubSub vs polling) + - Notification delivery guarantees + +4. **Use Cases** examples: + - Create task and assign to user + - Transition task through workflow states + - Handle blocked tasks and dependencies + - Generate team velocity reports + +### For Financial Systems + +Add emphasis on: + +1. **NEVER_DO.md** additions: + - Never use floats for money (float precision errors) + - Never allow partial transaction commits (atomicity required) + - Never skip double-entry validation (balance integrity) + - Never update balances without version check (optimistic locking) + +2. **Domain Model** inclusions: + - Double-entry bookkeeping explanation and examples + - Balance calculation patterns (debits vs credits) + - Commission/fee calculation models + - Two-phase transaction workflow (pending β†’ approved/canceled) + - Immutable audit trail requirements + +3. **ADRs** to add: + - Money representation (integer cents vs Decimal) + - Transaction isolation levels + - Audit trail implementation strategy + - Optimistic vs pessimistic locking choice + +4. **Use Cases** examples: + - Account-to-account transfer + - Payment with commission split + - Voucher creation and redemption + - Balance reconciliation + +### For E-Commerce Systems + +Add emphasis on: + +1. **Domain Model** additions: + - Order state machine (cart β†’ placed β†’ paid β†’ fulfilled β†’ delivered) + - Inventory management and reservation + - Payment processing flow + - Refund and cancellation handling + +2. **Workers** to document: + - Order fulfillment worker + - Inventory synchronization worker + - Email notification worker + - Abandoned cart recovery worker + +3. **Integration Patterns**: + - Payment gateway integration (Stripe, PayPal) + - Shipping provider APIs + - Inventory management system sync + +### For SaaS Platforms + +Add emphasis on: + +1. **Domain Model** additions: + - Multi-tenancy strategy (shared schema with tenant_id vs separate schemas) + - Subscription and billing models + - Usage tracking and metering + - Feature flags and plan limits + +2. **Data Layer** considerations: + - Tenant isolation strategy (row-level security) + - Cross-tenant query prevention + - Data partitioning approach + +3. 
**Security**: + - Tenant context enforcement + - API authentication (JWT, API keys) + - Authorization patterns (role-based, attribute-based) + +## Critical Patterns and Best Practices + +### State Machine Validation + +```elixir +# βœ… ALWAYS validate state transitions +def transition_status(task, new_status) do + if TaskLogic.can_transition?(task.status, new_status) do + Task.update(task, %{status: new_status}) + else + {:error, :invalid_transition} + end +end + +# Define valid transitions +def can_transition?(from_status, to_status) do + valid_transitions = %{ + todo: [:in_progress, :blocked], + in_progress: [:blocked, :review, :done], + blocked: [:todo, :in_progress], + review: [:in_progress, :done], + done: [] + } + + to_status in Map.get(valid_transitions, from_status, []) +end +``` + +### Optimistic Locking + +```elixir +# βœ… ALWAYS check version for concurrent updates +def update_task(task_id, new_attrs) do + task = Repo.get!(Task, task_id) + + changeset = + task + |> change(new_attrs) + |> optimistic_lock(:version) + + case Repo.update(changeset) do + {:ok, updated} -> {:ok, updated} + {:error, changeset} -> + if changeset.errors[:version] do + {:error, :version_conflict} + else + {:error, changeset} + end + end +end +``` + +### GenServer Usage (Infrastructure Only!) + +```elixir +# ❌ DON'T: GenServer per entity +defmodule TaskServer do + use GenServer + # Storing task state in process - DON'T DO THIS +end + +# βœ… DO: GenServer for infrastructure +defmodule TaskCache do + use GenServer + # Caching active tasks (transient data, can rebuild from DB) +end + +defmodule RateLimiter do + use GenServer + # Tracking API request counts (acceptable to lose on crash) +end +``` + +### Ecto.Multi for Atomic Operations + +```elixir +# βœ… ALWAYS use Multi for multi-step operations +Multi.new() +|> Multi.insert(:task, task_changeset) +|> Multi.run(:assign, fn _repo, %{task: task} -> + create_assignment(task.id, assignee_id) +end) +|> Multi.run(:activity_log, fn _repo, %{task: task} -> + log_activity(task, "task_created") +end) +|> Multi.run(:publish_event, fn _repo, changes -> + publish_event(changes) +end) +|> Repo.transaction() +``` + +### Async External Calls + +```elixir +# ❌ NEVER block request path +def send_notification(task) do + HTTPClient.post("https://notifications.com/api", ...) # BLOCKS! +end + +# βœ… ALWAYS enqueue background job +def send_notification(task) do + %{task_id: task.id, type: "assignment"} + |> NotificationWorker.new() + |> Oban.insert() +end +``` + +## Common Mistakes to Avoid + +1. **Too Generic** - Always adapt to specific domain needs +2. **Missing Examples** - Every principle needs concrete code +3. **Unclear Boundaries** - Director vs Implementor roles must be explicit +4. **No Trade-offs** - Always explain downsides of decisions in ADRs +5. **Incomplete ADRs** - Must include alternatives considered and why rejected +6. **Vague Metrics** - Use specific numbers (<100ms, 1000 TPS, >90% coverage) +7. **Umbrella Apps** - Unless explicitly requested, use Dave Thomas structure +8. 
**GenServers for Entities** - Database is source of truth, not processes
+
+## Quality Gates
+
+Before considering work complete:
+
+- [ ] All code examples use valid Elixir syntax
+- [ ] Every "NEVER DO" has a corresponding "ALWAYS DO"
+- [ ] Every ADR explains alternatives and why they were rejected
+- [ ] Domain model includes complete entity definitions with types
+- [ ] Performance targets are specific and measurable
+- [ ] Guardrails have clear, executable examples
+- [ ] Communication protocol includes message templates
+- [ ] Testing strategy covers unit/integration/property tests
+- [ ] Integration patterns include retry/circuit breaker
+- [ ] Supervision tree is documented with ASCII diagram
+
+## Success Criteria
+
+You've succeeded when:
+
+1. βœ… Director AI can create feature designs without asking architectural questions
+2. βœ… Implementor AI can write code without asking design questions
+3. βœ… All major decisions are documented with clear rationale
+4. βœ… Code examples are copy-paste ready
+5. βœ… Domain-specific requirements are thoroughly addressed
+6. βœ… Performance targets are realistic and measurable
+7. βœ… The system can be built by following the documentation alone
+
+## Notes
+
+- **Empty directories** (docs/design/, docs/plans/, docs/api/) are intentional - Director fills these during feature work
+- **Superpowers format** for implementation plans: Markdown with YAML frontmatter, 2-5 minute tasks
+- **All code examples** must be valid Elixir that could actually run
+- **Consult experts** via Task agents - don't guess at best practices
+- **Dave Thomas structure** preferred over umbrella apps unless user specifies otherwise
+- **Database as source of truth** - avoid GenServers for domain entities (see ADR-003)
diff --git a/components/skills/lang-rust-errors-dev/SKILL.md b/components/skills/lang-rust-errors-dev/SKILL.md
new file mode 100644
index 0000000..69bed02
--- /dev/null
+++ b/components/skills/lang-rust-errors-dev/SKILL.md
@@ -0,0 +1,130 @@
+---
+name: handling-rust-errors
+description: HASH error handling patterns using error-stack crate. Use when working with Result types, Report types, defining custom errors, propagating errors with change_context, adding context with attach, implementing Error trait, or documenting error conditions in Rust code.
+license: AGPL-3.0
+metadata:
+  triggers:
+    type: domain
+    enforcement: suggest
+    priority: high
+    keywords:
+      - error
+      - Result
+      - Report
+      - error-stack
+      - change_context
+      - attach
+      - ResultExt
+    intent-patterns:
+      - "\\b(handle|create|define|propagate|convert)\\b.*?\\berror\\b"
+      - "\\bReport<.*>\\b"
+---
+
+# Rust Error-Stack Patterns
+
+HASH-specific error handling patterns using the `error-stack` crate for consistent, debuggable error handling across the Rust codebase.
+
+## Core Principles
+
+**HASH uses `error-stack` exclusively for error handling:**
+
+βœ… **DO:**
+
+- Use `Report` for all error types
+- Use concrete error types: `Report<MyError>`
+- Import `Error` from `core::error` (not `std::error`)
+- Import `ResultExt as _` for trait methods
+
+❌ **DON'T:**
+
+- Use `anyhow` or `eyre` crates
+- Use `Box<dyn Error>` (except in tests/prototyping)
+- Use `Report<Box<dyn Error>>`
+- Use `thiserror` (use `derive_more` instead)
+
+## HashQL Compiler Exception
+
+**HashQL compiler code uses a different error handling approach.**
+
+Code in `libs/@local/hashql/*` uses the `hashql-diagnostics` crate instead of `error-stack`.
This is because compiler errors require rich formatting capabilities: + +- Source spans pointing to exact code locations +- Multiple labeled regions within the same diagnostic +- Fix suggestions with replacement text +- Severity levels (error, warning, hint) + +**Which approach to use:** + +| Location | Error Handling | +|-----------------------------------------|-----------------------------------------------------------------------------------------------------------| +| `libs/@local/hashql/*` (compiler code) | Use `hashql-diagnostics` β†’ See [writing-hashql-diagnostics](../writing-hashql-diagnostics/SKILL.md) skill | +| Everywhere else | Use `error-stack` patterns from this skill | + +Traditional `error-stack` patterns still apply for HashQL infrastructure code (CLI, file I/O, configuration) that doesn't involve compiler diagnostics. + +## Quick Start Guide + +Choose the reference that matches your current task: + +### Defining Errors + +**Use when:** Creating new error types or error enums + +- Define error types with `derive_more` +- Error enum patterns and variants +- Implement the `Error` trait +- Error type hierarchies + +### Propagating Errors + +**Use when:** Handling `Result` types, using `?` operator + +- Convert errors with `.change_context()` and `.change_context_with()` +- Add context with `.attach()` and `.attach_with()` +- Error conversion patterns + +### Documenting Errors + +**Use when:** Writing doc comments for fallible functions + +- `# Errors` section format +- Link error variants +- Document runtime errors +- Test error conditions + +## Common Quick Patterns + +### Creating an Error + +```rust +use error_stack::Report; + +return Err(Report::new(MyError::NotFound)) + .attach(format!("ID: {}", id)); +``` + +### Propagating with Context + +```rust +use error_stack::ResultExt as _; + +some_result + .change_context(MyError::OperationFailed) + .attach("Additional context")?; +``` + +### Lazy Context (for expensive operations) + +```rust +use error_stack::ResultExt as _; + +expensive_operation() + .change_context(MyError::OperationFailed) + .attach_with(|| format!("Debug info: {:?}", expensive_computation()))?; +``` + +## References + +- [Defining Errors](references/defining-errors.md) - Creating new error types or error enums +- [Propagating Errors](references/propagating-errors.md) - Handling `Result` types, using `?` operator +- [Documenting Errors](references/documenting-errors.md) - Writing doc comments for fallible functions diff --git a/components/skills/lang-rust-errors-dev/references/defining-errors.md b/components/skills/lang-rust-errors-dev/references/defining-errors.md new file mode 100644 index 0000000..82a42f0 --- /dev/null +++ b/components/skills/lang-rust-errors-dev/references/defining-errors.md @@ -0,0 +1,277 @@ +# Defining Errors + +This guide covers how to define custom error types in HASH using `error-stack` and `derive_more`. 
+ +--- + +## Basic Error Type + +Use `derive_more` for the `Display` trait: + +```rust +use core::error::Error; + +#[derive(Debug, derive_more::Display)] +#[display("Operation failed: {_variant}")] +pub enum MyError { + #[display("Resource `{id}` not found")] + NotFound { id: String }, + + #[display("Operation timed out after {seconds}s")] + Timeout { seconds: u64 }, + + #[display("Invalid input: {reason}")] + InvalidInput { reason: String }, +} + +impl Error for MyError {} +``` + +**Key Points:** + +- Use `#[derive(Debug, derive_more::Display)]` +- Top-level `#[display("...")]` provides fallback message +- Per-variant `#[display("...")]` for specific messages +- Use `{_variant}` in top-level to show variant name +- Manually implement `Error` trait (just `impl Error for MyError {}`) +- Import from `core::error::Error`, NOT `std::error::Error` + +--- + +## Error Enum Patterns + +### Simple Variants + +```rust +#[derive(Debug, derive_more::Display)] +pub enum DatabaseError { + #[display("Connection failed")] + ConnectionFailed, + + #[display("Query timeout")] + Timeout, + + #[display("Record not found")] + NotFound, +} + +impl Error for DatabaseError {} +``` + +### Variants with Data + +```rust +#[derive(Debug, derive_more::Display)] +pub enum ValidationError { + #[display("Field `{field}` is required")] + MissingField { field: String }, + + #[display("Invalid format for `{field}`: expected {expected}")] + InvalidFormat { + field: String, + expected: String, + }, + + #[display("Value `{value}` out of range [{min}, {max}]")] + OutOfRange { + value: i64, + min: i64, + max: i64, + }, +} + +impl Error for ValidationError {} +``` + +### Variants with Wrapped Errors + +```rust +#[derive(Debug, derive_more::Display)] +pub enum ConfigError { + #[display("Failed to read config file")] + ReadFailed, + + #[display("Failed to parse config")] + ParseFailed, + + #[display("Missing required field: {field}")] + MissingField { field: String }, +} + +impl Error for ConfigError {} + +// Use error-stack to wrap the underlying errors +fn load_config(path: &Path) -> Result> { + let contents = std::fs::read_to_string(path) + .map_err(|e| Report::new(e)) + .change_context(ConfigError::ReadFailed)?; + + let config: Config = serde_json::from_str(&contents) + .map_err(|e| Report::new(e)) + .change_context(ConfigError::ParseFailed)?; + + Ok(config) +} +``` + +--- + +## Error Type Hierarchies + +For complex systems, create error hierarchies: + +```rust +// High-level service error +#[derive(Debug, derive_more::Display)] +pub enum ServiceError { + #[display("Database operation failed")] + Database, + + #[display("Validation failed")] + Validation, + + #[display("Authorization denied")] + Authorization, + + #[display("External service error")] + External, +} + +impl Error for ServiceError {} + +// Specific database errors +#[derive(Debug, derive_more::Display)] +pub enum DatabaseError { + #[display("Connection failed")] + ConnectionFailed, + + #[display("Query failed")] + QueryFailed, + + #[display("Transaction aborted")] + TransactionAborted, +} + +impl Error for DatabaseError {} + +// Convert specific to general +fn process() -> Result<(), Report> { + fetch_from_db() + .change_context(ServiceError::Database)?; + + validate_input() + .change_context(ServiceError::Validation)?; + + Ok(()) +} +``` + +--- + +## Common Patterns + +### Error with Source Information + +```rust +#[derive(Debug, derive_more::Display)] +pub enum FileError { + #[display("Failed to open file at `{path}`")] + OpenFailed { path: String }, + + 
#[display("Failed to read file at line {line}")] + ReadFailed { line: usize }, + + #[display("Invalid file format in `{path}`: {reason}")] + InvalidFormat { path: String, reason: String }, +} + +impl Error for FileError {} +``` + +### Error with Debug Context + +```rust +#[derive(Debug, derive_more::Display)] +pub enum QueryError { + #[display("Query compilation failed")] + CompilationFailed, + + #[display("Query execution failed")] + ExecutionFailed, + + #[display("Invalid query parameter: {param}")] + InvalidParameter { param: String }, +} + +impl Error for QueryError {} + +// Usage with context +fn execute_query(sql: &str) -> Result> { + let compiled = compile(sql) + .change_context(QueryError::CompilationFailed) + .attach_printable(format!("SQL: {}", sql))?; + + run(compiled) + .change_context(QueryError::ExecutionFailed) + .attach_printable(format!("Compiled query: {:?}", compiled))?; + + // ... +} +``` + +--- + +## Best Practices + +### DO: + +βœ… Use descriptive variant names +βœ… Include relevant context in variant fields +βœ… Use `core::error::Error` instead of `std::error::Error` +βœ… Keep error messages user-friendly but informative +βœ… Use structured data (fields) instead of formatted strings + +### DON'T: + +❌ Use `thiserror` (use `derive_more` instead) +❌ Use `Box` in error variants +❌ Include sensitive data in error messages +❌ Make error messages too technical for end users +❌ Create overly generic error types + +--- + +## Testing Error Types + +```rust +#[test] +fn error_display_format() { + let error = MyError::NotFound { + id: "user_123".to_string(), + }; + + assert_eq!( + error.to_string(), + "Resource `user_123` not found", + "Error message should match expected format" + ); +} + +#[test] +fn error_with_report() { + let report = Report::new(MyError::Timeout { seconds: 30 }) + .attach_printable("During database query"); + + assert!(matches!( + report.current_context(), + MyError::Timeout { seconds: 30 } + )); +} +``` + +--- + +## Related References + +- [Propagating Errors](./propagating-errors.md) - Handle and propagate these errors +- [Documenting Errors](./documenting-errors.md) - Document these in functions diff --git a/components/skills/lang-rust-errors-dev/references/documenting-errors.md b/components/skills/lang-rust-errors-dev/references/documenting-errors.md new file mode 100644 index 0000000..ad2ebf5 --- /dev/null +++ b/components/skills/lang-rust-errors-dev/references/documenting-errors.md @@ -0,0 +1,349 @@ +# Documenting Errors + +This guide covers how to document error conditions in HASH Rust code. + +--- + +## Error Documentation Format + +All fallible functions must document their errors with an `# Errors` section. + +### Basic Format + +```rust +/// Creates a new web in the system. +/// +/// Registers a new web with the given parameters and ensures uniqueness. 
+///
+/// # Errors
+///
+/// - [`WebAlreadyExists`] if a web with the same ID already exists
+/// - [`AuthorizationError`] if the account lacks permission
+/// - [`DatabaseError`] if the operation fails at the database level
+///
+/// [`WebAlreadyExists`]: WebError::WebAlreadyExists
+/// [`AuthorizationError`]: WebError::Authorization
+/// [`DatabaseError`]: WebError::Database
+pub fn create_web(&mut self) -> Result<Web, Report<WebError>> {
+    // Implementation
+}
+```
+
+**Key Elements:**
+
+- `# Errors` section header
+- Bullet point for each error variant
+- Intra-doc links using `` [`VariantName`] `` syntax
+- Link definitions at the bottom
+
+---
+
+## Linking Error Variants
+
+### Same Module Errors
+
+```rust
+#[derive(Debug, derive_more::Display)]
+pub enum UserError {
+    #[display("User not found")]
+    NotFound,
+
+    #[display("Unauthorized access")]
+    Unauthorized,
+}
+
+impl Error for UserError {}
+
+/// Fetches a user by ID.
+///
+/// # Errors
+///
+/// - [`NotFound`] if the user doesn't exist
+/// - [`Unauthorized`] if the caller lacks permission
+///
+/// [`NotFound`]: UserError::NotFound
+/// [`Unauthorized`]: UserError::Unauthorized
+pub fn fetch_user(id: &str) -> Result<User, Report<UserError>> {
+    // Implementation
+}
+```
+
+### Cross-Module Errors
+
+```rust
+/// Validates user input.
+///
+/// # Errors
+///
+/// - [`ValidationError::EmptyInput`] if the input is empty
+/// - [`ValidationError::TooLong`] if the input exceeds max length
+///
+/// [`ValidationError::EmptyInput`]: crate::validation::ValidationError::EmptyInput
+/// [`ValidationError::TooLong`]: crate::validation::ValidationError::TooLong
+pub fn validate_input(input: &str) -> Result<(), Report<ValidationError>> {
+    // Implementation
+}
+```
+
+---
+
+## Runtime/Dynamic Errors
+
+For errors created dynamically (not enum variants):
+
+```rust
+/// Validates that all input values are unique.
+///
+/// # Errors
+///
+/// Returns a validation error if the input contains duplicate values
+pub fn validate_unique(values: &[String]) -> Result<(), Report<ValidationError>> {
+    for (i, value) in values.iter().enumerate() {
+        if values[i + 1..].contains(value) {
+            return Err(Report::new(ValidationError::DuplicateValue))
+                .attach(format!("Duplicate: {}", value));
+        }
+    }
+    Ok(())
+}
+```
+
+**Note:** No intra-doc links needed for dynamically created errors - just describe the condition.
+
+---
+
+## Multiple Error Sources
+
+When a function can fail for many reasons:
+
+```rust
+/// Processes a configuration file.
+///
+/// Reads the file from disk, parses it, and validates the contents.
+///
+/// # Errors
+///
+/// - [`ReadFailed`] if the file cannot be read
+/// - [`ParseFailed`] if the file contains invalid syntax
+/// - [`ValidationFailed`] if the configuration is semantically invalid
+/// - Returns an error if any required field is missing
+///
+/// [`ReadFailed`]: ConfigError::ReadFailed
+/// [`ParseFailed`]: ConfigError::ParseFailed
+/// [`ValidationFailed`]: ConfigError::ValidationFailed
+pub fn process_config(path: &Path) -> Result<Config, Report<ConfigError>> {
+    // Implementation
+}
+```
+
+---
+
+## Async Function Errors
+
+Document the same way as sync functions:
+
+```rust
+/// Fetches user data from the database.
+/// +/// # Errors +/// +/// - [`ConnectionFailed`] if the database connection is unavailable +/// - [`QueryFailed`] if the SQL query fails +/// - [`NotFound`] if no user with the given ID exists +/// +/// [`ConnectionFailed`]: DatabaseError::ConnectionFailed +/// [`QueryFailed`]: DatabaseError::QueryFailed +/// [`NotFound`]: DatabaseError::NotFound +pub async fn fetch_user_async(id: i64) -> Result> { + // Implementation +} +``` + +--- + +## Testing Error Conditions + +Write tests for each documented error case: + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn user_not_found_returns_error() { + let result = fetch_user("nonexistent_id"); + + let err = result.expect_err("should return error for nonexistent user"); + + // Check the error type + assert!( + matches!( + err.current_context(), + UserError::NotFound + ), + "should return NotFound error" + ); + } + + #[test] + fn unauthorized_access_returns_error() { + let result = fetch_user_without_permission("user_123"); + + let err = result.expect_err("should return error for unauthorized access"); + + assert!( + matches!( + err.current_context(), + UserError::Unauthorized + ), + "should return Unauthorized error" + ); + } + + #[test] + fn successful_fetch() { + let result = fetch_user("valid_user_id"); + + result.expect("should successfully fetch existing user"); + } +} +``` + +**Key Points:** + +- Test **every** error variant mentioned in docs +- Use `.expect_err("should...")` format +- Assert on specific error types with `matches!` +- Include success case tests too + +--- + +## Examples in Documentation + +When writing `# Examples` sections for fallible functions: + +### Prefer `?` Operator + +Use `?` for error propagation in examples whenever possible: + +```rust +/// Fetches and processes user data. +/// +/// # Examples +/// +/// ``` +/// # use myapp::{fetch_user, UserError}; +/// # use error_stack::Report; +/// let user = fetch_user("user_123")?; +/// println!("User: {}", user.name); +/// # Ok::<_, Box>(()) +/// ``` +pub fn fetch_user(id: &str) -> Result> { + // Implementation +} +``` + +**Key Points:** + +- Use `?` instead of `.unwrap()` or `match` +- Add `# Ok::<_, Box>(())` at the end +- This makes examples more realistic and idiomatic + +### When NOT to Use `?` + +Only use explicit error handling when demonstrating error handling itself: + +```rust +/// Validates user input. 
+/// +/// # Examples +/// +/// ``` +/// # use myapp::{validate_input, ValidationError}; +/// match validate_input("test") { +/// Ok(()) => println!("Valid"), +/// Err(e) => eprintln!("Invalid: {}", e), +/// } +/// ``` +pub fn validate_input(input: &str) -> Result<(), Report> { + // Implementation +} +``` + +--- + +## Best Practices + +### DO: + +βœ… Document ALL error cases in fallible functions +βœ… Use intra-doc links for error variants +βœ… Be specific about error conditions +βœ… Test each documented error case +βœ… Update docs when adding new error variants +βœ… Link to error enum documentation when relevant + +### DON'T: + +❌ Skip error documentation ("obvious" cases still need docs) +❌ Use plain text without intra-doc links +❌ Document only some error variants +❌ Write vague error descriptions ("may fail") +❌ Forget to update tests when docs change + +--- + +## Examples + +### Complete Function Documentation + +```rust +#[derive(Debug, derive_more::Display)] +pub enum RegistrationError { + #[display("Email already registered")] + EmailTaken, + + #[display("Invalid email format")] + InvalidEmail, + + #[display("Password too weak")] + WeakPassword, +} + +impl Error for RegistrationError {} + +/// Registers a new user in the system. +/// +/// Creates a new user account with the provided email and password. +/// The email must be unique and the password must meet security requirements. +/// +/// # Errors +/// +/// - [`EmailTaken`] if another user is already registered with this email +/// - [`InvalidEmail`] if the email format is invalid +/// - [`WeakPassword`] if the password doesn't meet security requirements +/// +/// [`EmailTaken`]: RegistrationError::EmailTaken +/// [`InvalidEmail`]: RegistrationError::InvalidEmail +/// [`WeakPassword`]: RegistrationError::WeakPassword +/// +/// # Examples +/// +/// ``` +/// # use myapp::{register_user, RegistrationError}; +/// # use error_stack::Report; +/// let user_id = register_user("user@example.com", "SecurePass123!")?; +/// # Ok::<_, Box>(()) +/// ``` +pub fn register_user(email: &str, password: &str) -> Result> { + // Implementation +} +``` + +--- + +## Related References + +- [Defining Errors](./defining-errors.md) - Create error types +- [Propagating Errors](./propagating-errors.md) - Add context and convert errors diff --git a/components/skills/lang-rust-errors-dev/references/propagating-errors.md b/components/skills/lang-rust-errors-dev/references/propagating-errors.md new file mode 100644 index 0000000..dd195f7 --- /dev/null +++ b/components/skills/lang-rust-errors-dev/references/propagating-errors.md @@ -0,0 +1,312 @@ +# Propagating Errors + +This guide covers how to propagate errors through your code using `error-stack`. + +--- + +## Basic Error Propagation + +### Using the `?` Operator + +```rust +use error_stack::{Report, ResultExt as _}; + +fn process_data(id: &str) -> Result> { + // Direct propagation - error types match + let raw = fetch_raw_data(id)?; + + // Convert and propagate + let processed = transform_data(raw) + .change_context(MyError::TransformFailed)?; + + Ok(processed) +} +``` + +**Note:** Import `ResultExt as _` to bring trait methods into scope without polluting namespace. 
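+As a minimal illustration of that anonymous import (the `InvalidPort` type and `parse_port` function below are hypothetical, not from the HASH codebase), the trait's methods stay usable while the name `ResultExt` never enters the module's namespace:
+
+```rust
+use core::error::Error;
+use error_stack::{Report, ResultExt as _}; // trait methods in scope, name is not
+
+#[derive(Debug, derive_more::Display)]
+#[display("Invalid port value")]
+pub struct InvalidPort;
+
+impl Error for InvalidPort {}
+
+// `.change_context()` and `.attach()` resolve through the anonymously
+// imported `ResultExt`; no `ResultExt` identifier exists in this module.
+fn parse_port(raw: &str) -> Result<u16, Report<InvalidPort>> {
+    raw.parse::<u16>()
+        .map_err(Report::new)
+        .change_context(InvalidPort)
+        .attach(format!("raw value: {raw}"))
+}
+```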
+
+---
+
+## Converting Error Types
+
+### Using `.change_context()`
+
+Convert one error type to another:
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn load_user(id: &str) -> Result<User, Report<UserError>> {
+    // Convert DatabaseError β†’ UserError
+    let data = db::fetch(id)
+        .change_context(UserError::DatabaseFailed)?;
+
+    // Convert ParseError β†’ UserError
+    let user = parse_user_data(data)
+        .change_context(UserError::ParseFailed)?;
+
+    Ok(user)
+}
+```
+
+---
+
+## Adding Context
+
+### Using `.attach()`
+
+Add debugging information without changing error type:
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn process_batch(items: &[Item]) -> Result<(), Report<ProcessError>> {
+    for (idx, item) in items.iter().enumerate() {
+        process_item(item)
+            .attach(format!("Failed at index {}", idx))
+            .attach(format!("Item ID: {}", item.id))?;
+    }
+
+    Ok(())
+}
+```
+
+### Combining Context and Conversion
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn update_user(id: &str, data: UserData) -> Result<User, Report<UserError>> {
+    let existing = fetch_user(id)
+        .change_context(UserError::FetchFailed)
+        .attach(format!("User ID: {}", id))?;
+
+    let updated = apply_updates(existing, data)
+        .change_context(UserError::UpdateFailed)
+        .attach(format!("Updates: {:?}", data))?;
+
+    save_user(&updated)
+        .change_context(UserError::SaveFailed)
+        .attach(format!("User: {:?}", updated.id))?;
+
+    Ok(updated)
+}
+```
+
+---
+
+## Lazy Context Attachment
+
+For expensive computations, use `_with` variants to defer evaluation:
+
+### Using `.attach_with()`
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn process_large_data(data: &LargeData) -> Result<(), Report<ProcessError>> {
+    expensive_operation(data)
+        .change_context(ProcessError::OperationFailed)
+        // Only compute debug string if error occurs
+        .attach_with(|| format!("Data summary: {:?}", data.compute_summary()))?;
+
+    Ok(())
+}
+```
+
+### Using `.change_context_with()`
+
+When error creation itself is expensive:
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn process_with_expensive_error(item: &Item) -> Result<(), Report<ComplexError>> {
+    operation(item)
+        // Error variant creation might involve computation
+        .change_context_with(|| ComplexError::from_item_analysis(item))
+        .attach_with(|| format!("Item state: {:?}", item.expensive_debug()))?;
+
+    Ok(())
+}
+```
+
+**Rule of thumb:** Use `_with` variants only when the closure does non-trivial work.
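+To make that rule concrete, here is a small sketch contrasting the two (the `read_config_file` and `dump_process_state` helpers and the `Config`/`ConfigError` types are placeholders, not real HASH APIs):
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+// `read_config_file`, `dump_process_state`, `Config`, and `ConfigError` are placeholders.
+fn load(path: &str) -> Result<Config, Report<ConfigError>> {
+    read_config_file(path)
+        .change_context(ConfigError::ReadFailed)
+        // Cheap literal: eager `.attach()` adds no meaningful overhead.
+        .attach("while loading application configuration")
+        // Expensive diagnostic: `_with` defers the formatting work until
+        // an error actually occurs.
+        .attach_with(|| format!("process state: {:?}", dump_process_state()))
+}
+```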
+
+---
+
+## Async Error Propagation
+
+Error propagation works the same in async code:
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+async fn fetch_and_process(id: String) -> Result<ProcessedData, Report<ProcessError>> {
+    // Propagate async errors
+    let raw = fetch_async(&id)
+        .await
+        .change_context(ProcessError::FetchFailed)
+        .attach(format!("ID: {}", id))?;
+
+    // Mix sync and async operations
+    let validated = validate_data(&raw)
+        .change_context(ProcessError::ValidationFailed)?;
+
+    let processed = process_async(validated)
+        .await
+        .change_context(ProcessError::ProcessingFailed)?;
+
+    Ok(processed)
+}
+```
+
+**Important:** The `.change_context()` call can appear before `.await` because `FutureExt` is in scope:
+
+```rust
+use error_stack::{FutureExt as _, ResultExt as _};
+
+// βœ… This works - `FutureExt` trait is in scope
+let result = async_operation()
+    .change_context(MyError::Failed)
+    .await?;
+
+// βœ… Also correct - context added after await using `ResultExt`
+let result = async_operation()
+    .await
+    .change_context(MyError::Failed)?;
+```
+
+---
+
+## Converting External Errors
+
+### Standard Library Errors
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn read_file(path: &Path) -> Result<String, Report<FileError>> {
+    // Convert std::io::Error
+    let contents = std::fs::read_to_string(path)
+        .map_err(Report::new)
+        .change_context(FileError::ReadFailed)
+        .attach(format!("Path: {}", path.display()))?;
+
+    Ok(contents)
+}
+```
+
+### Third-Party Library Errors
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn parse_json(json: &str) -> Result<Value, Report<ParseError>> {
+    // Convert serde_json::Error
+    let value: Value = serde_json::from_str(json)
+        .map_err(Report::new)
+        .change_context(ParseError::JsonParseFailed)
+        .attach(format!("JSON length: {}", json.len()))?;
+
+    Ok(value)
+}
+```
+
+---
+
+## Error Chains
+
+Build error chains for complex operations:
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn complex_operation(id: &str) -> Result<SaveResult, Report<ServiceError>> {
+    // Each step adds to the error chain
+    let data = fetch_data(id)
+        .change_context(ServiceError::FetchFailed)
+        .attach(format!("Step 1: fetch data for {}", id))?;
+
+    let validated = validate(data)
+        .change_context(ServiceError::ValidationFailed)
+        .attach("Step 2: validation")?;
+
+    let transformed = transform(validated)
+        .change_context(ServiceError::TransformFailed)
+        .attach("Step 3: transformation")?;
+
+    let result = save(transformed)
+        .change_context(ServiceError::SaveFailed)
+        .attach("Step 4: save result")?;
+
+    Ok(result)
+}
+```
+
+---
+
+## Best Practices
+
+### DO:
+
+βœ… Always add context when propagating errors
+βœ… Use `.change_context()` to convert error types at boundaries
+βœ… Include relevant IDs, indices, or state in attachments
+βœ… Use `_with` variants for non-trivial closures
+βœ… Import `ResultExt as _` to avoid namespace pollution
+βœ… Add context close to where the error occurs
+
+### DON'T:
+
+❌ Propagate errors without context
+❌ Add too much context (avoid duplicates)
+❌ Include sensitive data in attachments
+❌ Use `unwrap()` or `expect()` in production code
+❌ Silently ignore errors with `let _ = ...`
+❌ Use `_with` variants for trivial operations
+
+---
+
+## Common Patterns
+
+### Option to Result Conversion
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn get_user_by_id<'a>(id: &str, users: &'a HashMap<String, User>) -> Result<&'a User, Report<UserError>> {
+    users
+        .get(id)
+        .ok_or_else(|| Report::new(UserError::NotFound))
+        .attach(format!("User ID: {}", id))
+}
+```
+
+### Multiple Error Sources
+
+```rust
+use error_stack::{Report, ResultExt as _};
+
+fn process_config(path: &Path) -> Result<Config, Report<ConfigError>> {
+    let raw = std::fs::read_to_string(path)
+        .map_err(Report::new)
+        .change_context(ConfigError::ReadFailed)?;
+
+    let parsed: RawConfig = toml::from_str(&raw)
+        .map_err(Report::new)
+        .change_context(ConfigError::ParseFailed)?;
+
+    validate_config(&parsed)
+        .change_context(ConfigError::ValidationFailed)?;
+
+    Ok(build_config(parsed))
+}
+```
+
+---
+
+## Related References
+
+- [Defining Errors](./defining-errors.md) - Create error types
+- [Documenting Errors](./documenting-errors.md) - Document error conditions