diff --git a/examples/notebooks/retail_data_generation.py b/examples/notebooks/retail_data_generation.py
new file mode 100644
index 00000000..62b592de
--- /dev/null
+++ b/examples/notebooks/retail_data_generation.py
@@ -0,0 +1,692 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # CPG Supply Chain Dummy Data Generator
+# MAGIC
+# MAGIC ## Educational Guide to dbldatagen
+# MAGIC
+# MAGIC This notebook demonstrates how to use [**dbldatagen**](https://databrickslabs.github.io/dbldatagen/public_docs/index.html) to simulate data for a consumer packaged goods (CPG) supply chain.
+# MAGIC
+# MAGIC ### Datasets We'll Create:
+# MAGIC 1. **Products** - SKU master data with categories and pricing
+# MAGIC 2. **Distribution Centers** - Network locations with capacity
+# MAGIC 3. **Retail Stores** - Customer-facing locations
+# MAGIC 4. **Orders** - Manufacturing execution data
+# MAGIC 5. **Inventory Snapshots** - Multi-echelon inventory with risk metrics
+# MAGIC 6. **Shipments** - Transportation and logistics data
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Installation & Setup
+# MAGIC
+# MAGIC dbldatagen can be installed with `%pip install`, as a cluster-scoped library, or as a serverless environment-scoped library.
+
+# COMMAND ----------
+
+# MAGIC %pip install dbldatagen
+
+# COMMAND ----------
+
+# Restart Python so the newly installed library is picked up by the interpreter
+dbutils.library.restartPython()
+
+# COMMAND ----------
+
+import dbldatagen as dg
+from pyspark.sql.types import *
+from pyspark.sql import functions as F
+from datetime import datetime, timedelta
+
+print(f"Using dbldatagen version: {dg.__version__}")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Configuration
+# MAGIC
+# MAGIC **Best Practice**: Define all configuration parameters at the top for easy adjustment.
+
+# COMMAND ----------
+
+# Data generation parameters - adjust these to scale up/down
+NUM_PRODUCTS = 500
+NUM_DISTRIBUTION_CENTERS = 25
+NUM_STORES = 1000
+NUM_ORDERS = 10000
+NUM_INVENTORY_RECORDS = 50000
+NUM_SHIPMENTS = 30000
+
+# Catalog configuration - replace these placeholders with a catalog and schema you can write to
+CATALOG_NAME = 'CATALOG_NAME'
+SCHEMA_NAME = 'SCHEMA_NAME'
+
+# Set up the catalog and schema
+spark.sql(f"USE CATALOG {CATALOG_NAME}")
+spark.sql(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA_NAME}")
+spark.sql(f"USE SCHEMA {SCHEMA_NAME}")
+
+print(f"Generating data in: {CATALOG_NAME}.{SCHEMA_NAME}")
+print(f"Total records to generate: {NUM_PRODUCTS + NUM_DISTRIBUTION_CENTERS + NUM_STORES + NUM_ORDERS + NUM_INVENTORY_RECORDS + NUM_SHIPMENTS:,}")
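+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### A Minimal Example First
+# MAGIC
+# MAGIC Before building the six supply-chain tables, here is the core dbldatagen workflow in miniature: define a specification, chain `withColumn` calls onto it, then call `build()` to get a regular Spark dataframe. The `mini_spec` below is purely illustrative and is not part of the supply-chain schema.
+
+# COMMAND ----------
+
+# Minimal sketch of the spec -> build() workflow used throughout this notebook
+mini_spec = (
+    dg.DataGenerator(spark, name="mini_example", rows=5, partitions=1)
+    .withIdOutput()
+    .withColumn("measure", "integer", minValue=1, maxValue=100, random=True)
+)
+display(mini_spec.build())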
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 1. Product Master Data
+# MAGIC
+# MAGIC ### Learning Objectives:
+# MAGIC - How to use `withIdOutput()` for unique IDs
+# MAGIC - Creating string expressions with `concat()` and `lpad()`
+# MAGIC - Using `values` parameter for categorical data
+# MAGIC - Working with different data types (string, decimal, integer, date)
+# MAGIC
+# MAGIC ### Key Concepts:
+# MAGIC - **uniqueValues**: Ensures the column has exactly N unique values
+# MAGIC - **template**: Generates random words (\\w pattern)
+# MAGIC - **minValue/maxValue**: Range for numeric values
+# MAGIC - **begin/end**: Date range parameters
+
+# COMMAND ----------
+
+# Define categorical values for products
+product_categories = ["Beverages", "Snacks", "Dairy", "Bakery", "Frozen Foods", "Personal Care", "Household"]
+brands = ["Premium Brand A", "Value Brand B", "Store Brand C", "Organic Brand D", "Brand E"]
+
+# Build the data generator specification
+products_spec = (
+    dg.DataGenerator(spark, name="products", rows=NUM_PRODUCTS, partitions=4, startingId=1)
+
+    # withIdOutput() exposes the generated 'id' seed column in the output.
+    # startingId=1 starts the sequence at 1 instead of the default 0, keeping it
+    # consistent with the 1-based foreign keys used later in this notebook.
+    .withIdOutput()
+
+    # Create SKU codes: SKU-000001, SKU-000002, etc.
+    # expr allows SQL expressions; cast(id as string) converts the id to string
+    # lpad pads to 6 digits; uniqueValues ensures exactly NUM_PRODUCTS unique SKUs
+    .withColumn("sku", "string",
+                expr="concat('SKU-', lpad(cast(id as string), 6, '0'))",
+                uniqueValues=NUM_PRODUCTS)
+
+    # template uses \\w to generate random words
+    .withColumn("product_name", "string", template=r"\\w \\w Product")
+
+    # values with random=True picks randomly from the list
+    .withColumn("category", "string", values=product_categories, random=True)
+    .withColumn("brand", "string", values=brands, random=True)
+
+    # Numeric ranges for costs and pricing
+    .withColumn("unit_cost", "decimal(10,2)", minValue=0.5, maxValue=50.0, random=True)
+    .withColumn("unit_price", "decimal(10,2)", minValue=1.0, maxValue=100.0, random=True)
+
+    # Pick from specific values (case sizes)
+    .withColumn("units_per_case", "integer", values=[6, 12, 24, 48], random=True)
+    .withColumn("weight_kg", "decimal(8,2)", minValue=0.1, maxValue=25.0, random=True)
+    .withColumn("shelf_life_days", "integer", minValue=30, maxValue=730, random=True)
+
+    # Date range for when products were created
+    .withColumn("created_date", "date", begin="2020-01-01",
+                end="2024-01-01", interval="1 day", random=True)
+)
+
+# Build the dataframe from the specification
+df_products = products_spec.build()
+
+# Write to table
+df_products.write.mode("overwrite").saveAsTable("products")
+
+print(f"Created products table with {df_products.count():,} records")
+display(df_products.limit(10))
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 2. Distribution Centers
+# MAGIC
+# MAGIC ### Learning Objectives:
+# MAGIC - Creating location codes with expressions
+# MAGIC - Generating geographic coordinates (latitude/longitude)
+# MAGIC - Using realistic ranges for capacity and utilization metrics
+# MAGIC
+# MAGIC ### Pro Tip:
+# MAGIC When generating geographic data, use realistic ranges:
+# MAGIC - US Latitude: 25.0 to 49.0 (southern border to Canadian border)
+# MAGIC - US Longitude: -125.0 to -65.0 (west coast to east coast)
+
+# COMMAND ----------
+
+distribution_center_spec = (
+    dg.DataGenerator(spark, name="distribution_centers", rows=NUM_DISTRIBUTION_CENTERS, partitions=4, startingId=1)
+    .withIdOutput()
+
+    # distribution_center codes: distribution_center-0001, distribution_center-0002, etc.
+    .withColumn("distribution_center_code", "string",
+                expr="concat('distribution_center-', lpad(cast(id as string), 4, '0'))",
+                uniqueValues=NUM_DISTRIBUTION_CENTERS)
+
+    .withColumn("distribution_center_name", "string", template=r"\\w Distribution Center")
+
+    # Regional distribution for US
+    .withColumn("region", "string",
+                values=["Northeast", "Southeast", "Midwest", "Southwest", "West"],
+                random=True)
+
+    # Warehouse capacity metrics
+    .withColumn("capacity_pallets", "integer", minValue=5000, maxValue=50000, random=True)
+    .withColumn("current_utilization_pct", "decimal(5,2)", minValue=45.0, maxValue=95.0, random=True)
+
+    # Geographic coordinates for mapping
+    .withColumn("latitude", "decimal(9,6)", minValue=25.0, maxValue=49.0, random=True)
+    .withColumn("longitude", "decimal(9,6)", minValue=-125.0, maxValue=-65.0, random=True)
+
+    # Operating costs
+    .withColumn("operating_cost_daily", "decimal(10,2)", minValue=5000, maxValue=50000, random=True)
+    .withColumn("opened_date", "date", begin="2015-01-01", end="2023-01-01", random=True)
+)
+
+df_distribution_centers = distribution_center_spec.build()
+df_distribution_centers.write.mode("overwrite").saveAsTable("distribution_centers")
+
+print(f"Created distribution_centers table with {df_distribution_centers.count():,} records")
+display(df_distribution_centers.limit(10))
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 3. Retail Stores
+# MAGIC
+# MAGIC ### Learning Objectives:
+# MAGIC - Creating foreign key relationships (distribution_center_id references distribution_centers)
+# MAGIC - Generating realistic store attributes
+# MAGIC - Using longer store codes (6 digits vs 4 for distribution_centers)
+
+# COMMAND ----------
+
+store_formats = ["Hypermarket", "Supermarket", "Convenience", "Online", "Club Store"]
+retailers = ["RetailCo", "MegaMart", "QuickStop", "FreshGrocer", "ValueMart"]
+
+stores_spec = (
+    dg.DataGenerator(spark, name="stores", rows=NUM_STORES, partitions=8, startingId=1)
+    .withIdOutput()
+
+    # Store codes: STORE-000001, STORE-000002, etc.
+    .withColumn("store_code", "string",
+                expr="concat('STORE-', lpad(cast(id as string), 6, '0'))",
+                uniqueValues=NUM_STORES)
+
+    .withColumn("retailer", "string", values=retailers, random=True)
+    .withColumn("store_format", "string", values=store_formats, random=True)
+    .withColumn("region", "string",
+                values=["Northeast", "Southeast", "Midwest", "Southwest", "West"],
+                random=True)
+
+    # Store size range from small convenience to large hypermarket
+    .withColumn("square_footage", "integer", minValue=2000, maxValue=200000, random=True)
+
+    # FOREIGN KEY: Links to distribution_centers table
+    # Each store gets a distribution_center id between 1 and NUM_DISTRIBUTION_CENTERS
+    .withColumn("distribution_center_id", "integer", minValue=1, maxValue=NUM_DISTRIBUTION_CENTERS, random=True)
+
+    .withColumn("latitude", "decimal(9,6)", minValue=25.0, maxValue=49.0, random=True)
+    .withColumn("longitude", "decimal(9,6)", minValue=-125.0, maxValue=-65.0, random=True)
+    .withColumn("opened_date", "date", begin="2010-01-01", end="2024-01-01", random=True)
+)
+
+df_stores = stores_spec.build()
+df_stores.write.mode("overwrite").saveAsTable("stores")
+
+print(f"Created stores table with {df_stores.count():,} records")
+print("Each store is linked to a distribution center via the distribution_center_id foreign key")
+display(df_stores.limit(10))
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 4. Orders
+# MAGIC
+# MAGIC ### Learning Objectives:
+# MAGIC - Working with **timestamp** columns
+# MAGIC - Using intermediate random columns for complex calculations
+# MAGIC - Post-processing with PySpark transformations
+# MAGIC - Using modulo operations for distributing categorical values
+# MAGIC
+# MAGIC ### Advanced Pattern:
+# MAGIC When you need complex logic that depends on random values:
+# MAGIC 1. Generate random "seed" columns in the spec
+# MAGIC 2. Build the dataframe
+# MAGIC 3. Use PySpark `.withColumn()` to create derived columns
+# MAGIC 4. Drop the intermediate seed columns
+
+# COMMAND ----------
+
+order_status = ["Scheduled", "In Progress", "Completed", "Delayed", "Quality Hold"]
+
+order_spec = (
+    dg.DataGenerator(spark, name="orders", rows=NUM_ORDERS, partitions=8, startingId=1)
+    .withIdOutput()
+
+    .withColumn("order_number", "string",
+                expr="concat('PO-', lpad(cast(id as string), 8, '0'))",
+                uniqueValues=NUM_ORDERS)
+
+    # FOREIGN KEYS
+    .withColumn("distribution_center_id", "integer", minValue=1, maxValue=NUM_DISTRIBUTION_CENTERS, random=True)
+    .withColumn("product_id", "integer", minValue=1, maxValue=NUM_PRODUCTS, random=True)
+
+    # Base timestamp for the order
+    .withColumn("order_date", "timestamp",
+                begin="2024-01-01 00:00:00",
+                end="2025-09-29 23:59:59",
+                random=True)
+
+    # Random seed columns for calculations (will be used then dropped)
+    .withColumn("scheduled_start_days", "integer", minValue=0, maxValue=10, random=True)
+    .withColumn("scheduled_duration_days", "integer", minValue=1, maxValue=6, random=True)
+    .withColumn("start_delay_hours", "integer", minValue=-12, maxValue=12, random=True)
+    .withColumn("actual_duration_hours", "integer", minValue=24, maxValue=144, random=True)
+    .withColumn("start_probability", "double", minValue=0, maxValue=1, random=True)
+    .withColumn("completion_probability", "double", minValue=0, maxValue=1, random=True)
+    .withColumn("quantity_ordered", "integer", minValue=500, maxValue=50000, random=True)
+    .withColumn("order_variance", "double", minValue=0.85, maxValue=1.0, random=True)
+
+    # Use modulo to distribute status values evenly
+    # status_rand % 5 gives values 0-4, which we'll map to our 5 status values
+    .withColumn("status_rand", "integer", minValue=1, maxValue=10000, random=True)
+
+    .withColumn("line_efficiency_pct", "decimal(5,2)", minValue=75.0, maxValue=98.0, random=True)
+    .withColumn("production_cost", "decimal(12,2)", minValue=5000, maxValue=500000, random=True)
+)
+
+# Build the base dataframe
+df_orders = order_spec.build()
+
+# POST-PROCESSING: Add calculated columns using PySpark
+df_orders = (
+    df_orders
+    # Calculate scheduled start by adding days to order_date
+    # (date_add returns a date, which is fine for day-level scheduling)
+    .withColumn("scheduled_start",
+                F.expr("date_add(order_date, scheduled_start_days)"))
+
+    # Calculate scheduled end
+    .withColumn("scheduled_end",
+                F.expr("date_add(scheduled_start, scheduled_duration_days)"))
+
+    # Actual start: only if probability > 0.3, add delay hours
+    .withColumn("actual_start",
+                F.when(F.col("start_probability") > 0.3,
+                       F.expr("timestampadd(HOUR, start_delay_hours, scheduled_start)"))
+                .otherwise(None))
+
+    # Actual end: only if started AND probability > 0.2
+    .withColumn("actual_end",
+                F.when((F.col("actual_start").isNotNull()) &
+                       (F.col("completion_probability") > 0.2),
+                       F.expr("timestampadd(HOUR, actual_duration_hours, actual_start)"))
+                .otherwise(None))
+
+    # Quantity produced: apply variance if completed
+    .withColumn("quantity_produced",
+                F.when(F.col("actual_end").isNotNull(),
+                       (F.col("quantity_ordered") * F.col("order_variance")).cast("integer"))
+                .otherwise(0))
+
+    # Map status_rand to status using modulo and array indexing
+    .withColumn("status_index", F.col("status_rand") % 5)
+    .withColumn("status",
+                F.array([F.lit(s) for s in order_status]).getItem(F.col("status_index")))
+
+    # Clean up: drop intermediate columns
+    .drop("scheduled_start_days", "scheduled_duration_days", "start_delay_hours",
+          "actual_duration_hours", "start_probability", "completion_probability",
+          "order_variance", "status_rand", "status_index")
+)
+
+df_orders.write.mode("overwrite").saveAsTable("orders")
+
+print(f"Created orders table with {df_orders.count():,} records")
+print("Order status distribution:")
+df_orders.groupBy("status").count().orderBy("status").show()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 5. Inventory Snapshots
+# MAGIC
+# MAGIC ### Learning Objectives:
+# MAGIC - Using **CASE expressions** in SQL for conditional logic
+# MAGIC - Creating weighted distributions with seed columns
+# MAGIC - Handling division by zero with conditional logic
+# MAGIC - Deriving conditional foreign keys with CASE expressions
+# MAGIC
+# MAGIC ### Pattern: Weighted Categorical Distribution
+# MAGIC To get 30% distribution_center and 70% Store:
+# MAGIC 1. Create a seed column with values 0-1
+# MAGIC 2. Use CASE: when seed < 0.3 then 'distribution_center' else 'Store'
+# MAGIC
+# MAGIC ### Pattern: Safe Division
+# MAGIC Always check the denominator before dividing to avoid errors
+
+# COMMAND ----------
+
+inventory_spec = (
+    dg.DataGenerator(spark, name="inventory", rows=NUM_INVENTORY_RECORDS, partitions=8, startingId=1)
+    .withIdOutput()
+
+    # Date range for inventory snapshots
+    .withColumn("snapshot_date", "date",
+                begin="2024-01-01",
+                end="2025-09-29",
+                random=True)
+
+    # Weighted distribution: 30% distribution_center, 70% Store
+    .withColumn("location_type_seed", "double", minValue=0, maxValue=1, random=True)
+    .withColumn("location_type", "string", expr="""
+        CASE
+            WHEN location_type_seed < 0.3 THEN 'distribution_center'
+            ELSE 'Store'
+        END
+    """)
+
+    # Create location_id based on location_type using expr.
+    # The modulo bounds are interpolated from the config constants so the ids
+    # stay in range if NUM_DISTRIBUTION_CENTERS or NUM_STORES are changed.
+    .withColumn("location_id", "integer", expr=f"""
+        CASE
+            WHEN location_type = 'distribution_center' THEN (id % {NUM_DISTRIBUTION_CENTERS}) + 1
+            ELSE (id % {NUM_STORES}) + 1
+        END
+    """)
+
+    # FOREIGN KEY
+    .withColumn("product_id", "integer", minValue=1, maxValue=NUM_PRODUCTS, random=True)
+
+    # Inventory quantities
+    .withColumn("quantity_on_hand", "integer", minValue=0, maxValue=10000, random=True)
+    .withColumn("reserve_factor", "double", minValue=0, maxValue=0.5, random=True)
+
+    # Calculate reserved quantity using expr
+    .withColumn("quantity_reserved", "integer", expr="cast(quantity_on_hand * reserve_factor as int)")
+
+    # Calculate available quantity
+    .withColumn("quantity_available", "integer", expr="quantity_on_hand - quantity_reserved")
+
+    .withColumn("reorder_point", "integer", minValue=100, maxValue=2000, random=True)
+
+    # Demand rate for calculations
+    .withColumn("daily_demand", "double", minValue=50.0, maxValue=150.0, random=True)
+
+    # Calculate days of supply with safe division
+    .withColumn("days_of_supply", "decimal(8,2)", expr="""
+        CASE
+            WHEN daily_demand > 0 THEN cast(quantity_available / daily_demand as decimal(8,2))
+            ELSE NULL
+        END
+    """)
+
+    .withColumn("inventory_value", "decimal(12,2)", minValue=1000, maxValue=500000, random=True)
+    .withColumn("days_offset", "integer", minValue=0, maxValue=60, random=True)
+
+    # Date arithmetic using expr
+    .withColumn("last_received_date", "date", expr="date_sub(snapshot_date, days_offset)")
expr="date_sub(snapshot_date, days_offset)") + + # Risk categorization using expr + .withColumn("stockout_risk", "string", expr=""" + CASE + WHEN days_of_supply IS NULL OR days_of_supply < 3 THEN 'High' + WHEN days_of_supply < 7 THEN 'Medium' + ELSE 'Low' + END + """) +) + +# Build and drop intermediate columns +df_inventory = inventory_spec.build().drop("reserve_factor", "days_offset", "location_type_seed") + +df_inventory.write.mode("overwrite").saveAsTable("inventory") + +print(f"Created inventory table with {df_inventory.count():,} records") +print(f"Location type distribution:") +df_inventory.groupBy("location_type").count().show() +print(f"Stockout risk distribution:") +df_inventory.groupBy("stockout_risk").count().orderBy("stockout_risk").show() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## 6. Shipments +# MAGIC +# MAGIC ### Learning Objectives: +# MAGIC - Creating **multiple weighted categorical columns** +# MAGIC - Working with date arithmetic (transit times) +# MAGIC - Computing derived metrics (on_time, delay_hours) +# MAGIC - Handling NULL values in calculations +# MAGIC +# MAGIC ### Pattern: Multiple Weighted Categories +# MAGIC For transport_mode with weights [60%, 15%, 20%, 5%]: +# MAGIC - 0.00-0.60: Truck (60%) +# MAGIC - 0.60-0.75: Rail (15%) +# MAGIC - 0.75-0.95: Intermodal (20%) +# MAGIC - 0.95-1.00: Air (5%) + +# COMMAND ---------- + +shipment_status = ["In Transit", "Delivered", "Delayed", "At Hub", "Out for Delivery"] +transport_modes = ["Truck", "Rail", "Intermodal", "Air"] + +shipments_spec = ( + dg.DataGenerator(spark, name="shipments", rows=NUM_SHIPMENTS, partitions=8) + .withIdOutput() + + .withColumn("shipment_id", "string", + expr="concat('SHP-', lpad(cast(id as string), 10, '0'))", + uniqueValues=NUM_SHIPMENTS) + + # FOREIGN KEY: Origin is always a distribution_center + .withColumn("origin_distribution_center_id", "integer", minValue=1, maxValue=NUM_DISTRIBUTION_CENTERS, random=True) + + # Destination can be distribution_center or Store (30% distribution_center, 70% Store) + .withColumn("destination_type_seed", "double", minValue=0, maxValue=1, random=True) + .withColumn("destination_type", "string", expr=""" + CASE + WHEN destination_type_seed < 0.3 THEN 'distribution_center' + ELSE 'Store' + END + """) + + # Create destination_id based on destination_type + .withColumn("destination_id", "integer", expr=""" + CASE + WHEN destination_type = 'distribution_center' THEN (id % 25) + 1 + ELSE (id % 1000) + 1 + END + """) + + .withColumn("product_id", "integer", minValue=1, maxValue=NUM_PRODUCTS, random=True) + + # Shipment dates + .withColumn("ship_date", "timestamp", + begin="2024-01-01 00:00:00", + end="2025-09-29 23:59:59", + random=True) + + # Transit time ranges + .withColumn("transit_days", "integer", minValue=1, maxValue=6, random=True) + .withColumn("actual_transit_days", "integer", minValue=1, maxValue=8, random=True) + .withColumn("delivery_probability", "double", minValue=0, maxValue=1, random=True) + + # Expected delivery = ship_date + transit_days (using date_add) + .withColumn("expected_delivery", "timestamp", expr="date_add(ship_date, transit_days)") + + # Actual delivery: only 80% of shipments are delivered + .withColumn("actual_delivery", "timestamp", expr=""" + CASE + WHEN delivery_probability > 0.2 THEN date_add(ship_date, actual_transit_days) + ELSE NULL + END + """) + + # On-time check: delivered AND before/at expected time + .withColumn("on_time", "boolean", expr=""" + actual_delivery IS NOT NULL AND actual_delivery <= expected_delivery + 
""") + + # Calculate delay in hours (can be negative for early deliveries) + .withColumn("delay_hours", "integer", expr=""" + CASE + WHEN actual_delivery IS NOT NULL THEN + cast((unix_timestamp(actual_delivery) - unix_timestamp(expected_delivery)) / 3600 as int) + ELSE NULL + END + """) + + .withColumn("quantity", "integer", minValue=100, maxValue=5000, random=True) + + # Transport mode with weighted distribution: 60% Truck, 15% Rail, 20% Intermodal, 5% Air + .withColumn("transport_mode_seed", "double", minValue=0, maxValue=1, random=True) + .withColumn("transport_mode", "string", expr=""" + CASE + WHEN transport_mode_seed < 0.60 THEN 'Truck' + WHEN transport_mode_seed < 0.75 THEN 'Rail' + WHEN transport_mode_seed < 0.95 THEN 'Intermodal' + ELSE 'Air' + END + """) + + .withColumn("carrier", "string", + values=["FastFreight", "ReliableLogistics", "ExpressTransport", "GlobalShippers"], + random=True) + + # Status with weighted distribution: 25% In Transit, 50% Delivered, 5% Delayed, 10% At Hub, 10% Out for Delivery + .withColumn("status_seed", "double", minValue=0, maxValue=1, random=True) + .withColumn("status", "string", expr=""" + CASE + WHEN status_seed < 0.25 THEN 'In Transit' + WHEN status_seed < 0.75 THEN 'Delivered' + WHEN status_seed < 0.80 THEN 'Delayed' + WHEN status_seed < 0.90 THEN 'At Hub' + ELSE 'Out for Delivery' + END + """) + + .withColumn("shipping_cost", "decimal(10,2)", minValue=50, maxValue=5000, random=True) + .withColumn("distance_miles", "integer", minValue=50, maxValue=2500, random=True) +) + +# Build and drop intermediate columns +df_shipments = shipments_spec.build().drop( + "transit_days", "actual_transit_days", "delivery_probability", + "destination_type_seed", "transport_mode_seed", "status_seed" +) + +df_shipments.write.mode("overwrite").saveAsTable("shipments") + +print(f"Created shipments table with {df_shipments.count():,} records") +print(f"Transport mode distribution:") +df_shipments.groupBy("transport_mode").count().orderBy(F.desc("count")).show() +print(f"Shipment status distribution:") +df_shipments.groupBy("status").count().orderBy(F.desc("count")).show() +display(df_shipments) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Demo Use Cases +# MAGIC +# MAGIC This dataset enables the following analytics use cases: +# MAGIC +# MAGIC ### Inventory Optimization +# MAGIC - Stockout risk identification and prediction +# MAGIC - Days of supply analysis by product/location +# MAGIC - Slow-moving inventory identification +# MAGIC +# MAGIC ### Logistics & Transportation +# MAGIC - Carrier performance scorecards (OTD%, cost, speed) +# MAGIC - Route optimization opportunities +# MAGIC - Transport mode analysis (cost vs speed tradeoffs) +# MAGIC +# MAGIC ### Order Planning +# MAGIC - Order schedule optimization +# MAGIC - Line efficiency tracking +# MAGIC - Capacity planning and utilization +# MAGIC +# MAGIC ### Supply Chain Analytics +# MAGIC - End-to-end supply chain visibility +# MAGIC - Network optimization (distribution_center placement, capacity) +# MAGIC - Working capital optimization +# MAGIC +# MAGIC ### AI/ML Use Cases +# MAGIC - Demand forecasting +# MAGIC - Predictive maintenance (production efficiency) +# MAGIC - Shipment delay prediction + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Sample Queries to Get Started +# MAGIC +# MAGIC Here are some queries you can run to explore the data. 
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Query 1: Current Inventory Health
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC -- Inventory health by location type and risk level
+# MAGIC SELECT
+# MAGIC   location_type,
+# MAGIC   stockout_risk,
+# MAGIC   COUNT(*) as item_count,
+# MAGIC   SUM(inventory_value) as total_value,
+# MAGIC   ROUND(AVG(days_of_supply), 1) as avg_days_supply
+# MAGIC FROM inventory
+# MAGIC WHERE snapshot_date = (SELECT MAX(snapshot_date) FROM inventory)
+# MAGIC GROUP BY location_type, stockout_risk
+# MAGIC ORDER BY location_type,
+# MAGIC   CASE stockout_risk
+# MAGIC     WHEN 'High' THEN 1
+# MAGIC     WHEN 'Medium' THEN 2
+# MAGIC     WHEN 'Low' THEN 3
+# MAGIC   END
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Query 2: Carrier Performance Comparison
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC -- Compare carriers on key metrics
+# MAGIC SELECT
+# MAGIC   carrier,
+# MAGIC   COUNT(*) as total_shipments,
+# MAGIC   ROUND(AVG(CASE WHEN on_time = true THEN 100.0 ELSE 0.0 END), 1) as otd_pct,
+# MAGIC   ROUND(AVG(shipping_cost), 2) as avg_cost,
+# MAGIC   ROUND(AVG(distance_miles), 0) as avg_distance,
+# MAGIC   ROUND(AVG(shipping_cost / distance_miles), 3) as cost_per_mile
+# MAGIC FROM shipments
+# MAGIC WHERE actual_delivery IS NOT NULL
+# MAGIC GROUP BY carrier
+# MAGIC ORDER BY total_shipments DESC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Query 3: Supply Chain Network Overview
+
+# COMMAND ----------
+
+# MAGIC %sql
+# MAGIC -- Distribution center performance and utilization.
+# MAGIC -- Inventory and shipments are pre-aggregated in CTEs so that joining both onto
+# MAGIC -- distribution_centers does not fan out and inflate the SUM/AVG results.
+# MAGIC WITH latest_inventory AS (
+# MAGIC   SELECT
+# MAGIC     location_id,
+# MAGIC     COUNT(DISTINCT product_id) as active_skus,
+# MAGIC     SUM(inventory_value) as inventory_value
+# MAGIC   FROM inventory
+# MAGIC   WHERE location_type = 'distribution_center'
+# MAGIC     AND snapshot_date = (SELECT MAX(snapshot_date) FROM inventory)
+# MAGIC   GROUP BY location_id
+# MAGIC ),
+# MAGIC recent_shipments AS (
+# MAGIC   SELECT
+# MAGIC     origin_distribution_center_id,
+# MAGIC     COUNT(*) as outbound_shipments_last_30d,
+# MAGIC     ROUND(AVG(CASE WHEN on_time = true THEN 100.0 ELSE 0.0 END), 1) as otd_pct
+# MAGIC   FROM shipments
+# MAGIC   WHERE ship_date >= CURRENT_DATE - INTERVAL 30 DAY
+# MAGIC   GROUP BY origin_distribution_center_id
+# MAGIC )
+# MAGIC SELECT
+# MAGIC   dc.distribution_center_code,
+# MAGIC   dc.region,
+# MAGIC   dc.capacity_pallets,
+# MAGIC   ROUND(dc.current_utilization_pct, 1) as utilization_pct,
+# MAGIC   i.active_skus,
+# MAGIC   i.inventory_value,
+# MAGIC   s.outbound_shipments_last_30d,
+# MAGIC   s.otd_pct
+# MAGIC FROM distribution_centers dc
+# MAGIC LEFT JOIN latest_inventory i ON dc.id = i.location_id
+# MAGIC LEFT JOIN recent_shipments s ON dc.id = s.origin_distribution_center_id
+# MAGIC ORDER BY i.inventory_value DESC
+
+# COMMAND ----------
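+
+# MAGIC %md
+# MAGIC ### Optional: Cleanup
+# MAGIC
+# MAGIC When you are finished exploring, the cell below drops the six generated tables. This is a minimal sketch assuming the session is still pointed at the catalog and schema configured at the top of the notebook; skip it if you want to keep the data.
+
+# COMMAND ----------
+
+# Drop the demo tables created by this notebook (IF EXISTS makes this safe to re-run)
+for table_name in ["products", "distribution_centers", "stores", "orders", "inventory", "shipments"]:
+    spark.sql(f"DROP TABLE IF EXISTS {table_name}")
+    print(f"Dropped {table_name} (if it existed)")
+
+# COMMAND ----------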