PyPSA · cdgaete · Apr 3, 2025 · Apr 3, 2025 · Apr 3, 2025 · Apr 5, 2025
diff --git a/.codespell.ignore b/.codespell.ignore
@@ -3,3 +3,5 @@ ue
 gud
 hel
 BU
+Nam
+FO
diff --git a/.github/workflows/type-checking.yml b/.github/workflows/type-checking.yml
diff --git a/.gitignore b/.gitignore
@@ -96,3 +96,6 @@ test.ipynb
 # temporary
 .devcontainer/
 .repoai/
+
+uv.lock
+output/
diff --git a/analysis/1_osm_basics.py b/analysis/1_osm_basics.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""
+OSM Tutorial Part 1: Data Loading and Configuration
+==================================================
+
+Learn how to:
+1. Load OSM power plant data
+2. Configure data processing options
+3. Handle data quality settings
+"""
+
+from powerplantmatching.core import get_config
+from powerplantmatching.data import OSM
+
+# Understanding the OSM() function
+# ================================
+# The OSM() function is a high-level interface that automatically:
+# - Downloads or loads cached OpenStreetMap data
+# - Processes raw OSM elements into power plants
+# - Applies quality filters and validation
+# - Estimates missing capacities (if enabled)
+# - Reconstructs plants from generators (if enabled)
+# - Returns a clean pandas DataFrame ready for analysis
+#
+# For more control over these steps, see tutorials 2 & 3
+
+# Example 1: Basic data loading
+# =============================
+config = get_config()  # Fetch the powerplantmatching configuration
+config["target_countries"] = ["Luxembourg"]  # Restrict the run to one country
+
+# Load with all other settings left as configured
+df = OSM(config=config)
+print(f"Loaded {len(df)} power plants from Luxembourg\n")
+
+
+# Example 2: Configure data quality requirements
+# ==============================================
+# The OSM module can filter data based on completeness
+
+# First, get baseline count with permissive settings
+config_baseline = get_config()
+config_baseline["target_countries"] = ["Luxembourg"]
+config_baseline["OSM"]["missing_name_allowed"] = True  # Allow unnamed plants
+df_baseline = OSM(config=config_baseline)
+
+# Now apply strict requirements
+config["OSM"]["missing_name_allowed"] = False  # Reject unnamed plants
+config["OSM"]["missing_technology_allowed"] = True  # Allow missing technology
+config["OSM"]["missing_start_date_allowed"] = True  # Allow missing start dates
+
+# This will return fewer plants due to stricter requirements
+df_strict = OSM(config=config)
+print(f"With strict name requirement: {len(df_strict)} plants")
+print(f"Filtered out: {len(df_baseline) - len(df_strict)} plants without names\n")
+
+
+# Example 3: Control data processing features
+# ===========================================
+config["OSM"]["capacity_extraction"]["enabled"] = True  # Extract capacity from tags
+config["OSM"]["capacity_estimation"]["enabled"] = True  # Estimate missing capacities
+config["OSM"]["units_clustering"]["enabled"] = False  # Don't cluster nearby generators
+config["OSM"]["units_reconstruction"]["enabled"] = (
+    True  # Reconstruct plants from generators
+)
+
+df_processed = OSM(config=config)
+print(f"With extraction, estimation and reconstruction: {len(df_processed)} plants\n")
+
+# Note: capacity_extraction vs capacity_estimation
+# - Extraction: Reads capacity from OSM tags (plant:output:electricity=10 MW)
+# - Estimation: Calculates capacity when missing (e.g., from area for solar)
+
+
+# Example 4: Cache behavior - force_refresh vs update
+# ===================================================
+# Two parameters control how OSM handles cached data:
+# - force_refresh: whether the OSM module reuses its own cache or downloads again
+# - update: whether powerplantmatching's (PPM) high-level cache is refreshed
+
+# Case 1: Use all caches (fastest)
+config["OSM"]["force_refresh"] = False  # Use OSM's cache
+df_cached = OSM(config=config, update=False)  # Use PPM's cache
+
+# Case 2: Update PPM cache from OSM cache
+config["OSM"]["force_refresh"] = False  # Use OSM's cache (unchanged from Case 1)
+df_updated = OSM(config=config, update=True)  # Refresh PPM's cache
+
+# Case 3: Full refresh from OpenStreetMap (slowest)
+# config["OSM"]["force_refresh"] = True  # Download from OSM
+# df_fresh = OSM(config=config, update=True)  # Update PPM's cache
+
+# Summary:
+# - force_refresh=False, update=False: Use all cached data
+# - force_refresh=False, update=True: Refresh PPM cache from OSM cache
+# - force_refresh=True, update=True: Download fresh from OpenStreetMap
+
+
+# Example 5: Load multiple countries efficiently
+# ==============================================
+config["target_countries"] = ["Luxembourg", "Malta", "Cyprus"]
+config["OSM"]["plants_only"] = True  # Only load plants, not generators
+
+df_multi = OSM(config=config)
+print(f"Loaded {len(df_multi)} plants from 3 countries")  # 3 = len(target_countries)
+
+# The module handles each country separately for memory efficiency
+
+
+# Example 6: Custom cache directory
+# =================================
+# The OSM module supports custom cache directories via config
+# This is useful for managing large caches or sharing between projects
+
+# Method 1: Set in config.yaml
+# OSM:
+#   cache_dir: ~/osm_caches/project1  # Custom location
+#   fn: osm_data.csv                  # CSV filename (stored IN cache_dir)
+
+# Method 2: Set programmatically
+config["OSM"]["cache_dir"] = "~/osm_caches/europe"  # ~ expands to the home directory
+df_custom_cache = OSM(config=config)
+
+# Benefits:
+# - Keep large caches (6GB for 249 countries) separate from project
+# - Share cache across multiple projects
+# - Use faster/larger storage for cache
+# - Separate test/dev/prod caches
+# - All OSM data in one place (CSV + API caches)
+
+# The cache_dir path can be:
+# - Absolute: /data/osm_cache
+# - Relative: ./cache/osm (relative to data directory)
+# - With ~: ~/osm_caches/global (expands to home directory)
+
+# The CSV cache file (osm_data.csv) is stored INSIDE cache_dir
+# Structure:
+#   cache_dir/
+#   ├── osm_data.csv     # CSV cache (all countries)
+#   ├── plants/          # API cache
+#   ├── generators/      # API cache
+#   └── units/           # API cache
+
+print(f"✓ Example 6 complete! Loaded {len(df_custom_cache)} plants")
+
+
+# Example 7: Understanding source and technology mapping
+# ======================================================
+# OSM data uses various tags that are mapped to standard categories
+# This ensures consistency across different tagging conventions
+
+# The mapping is defined in config.yaml under OSM section:
+# - source_mapping: Maps OSM generator:source tags to standard fuel types
+# - technology_mapping: Maps OSM generator:method tags to standard technologies
+
+# Standard fuel types (see powerplantmatching.CONSTANT_FUELTYPE):
+# ['Bioenergy', 'Geothermal', 'Hard Coal', 'Hydro', 'Lignite',
+#  'Natural Gas', 'Nuclear', 'Oil', 'Other', 'Solar', 'Wind']
+
+# Standard technologies (see powerplantmatching documentation):
+# ['CCGT', 'OCGT', 'Steam Turbine', 'Combustion Engine',
+#  'Run-Of-River', 'Reservoir', 'Pumped Storage',
+#  'Onshore', 'Offshore', 'PV', 'CSP']
+
+# Example mappings from config.yaml:
+# source_mapping:
+#   Solar: [solar, photovoltaic, solar_thermal, pv]
+#   Wind: [wind, wind_power, wind_turbine]
+#   Natural Gas: [gas, natural_gas, lng]
+
+# This means:
+# - generator:source=solar → Fueltype="Solar"
+# - generator:source=gas → Fueltype="Natural Gas"
+# - generator:method=photovoltaic → Technology="PV"
+
+# You can extend mappings for regional variations (appends to the lists in config):
+config["OSM"]["source_mapping"]["Solar"].append("sonnenkraft")  # German
+config["OSM"]["technology_mapping"]["PV"].append("fotovoltaico")  # Spanish
+
+# Reload with extended mappings (update=True refreshes PPM's cache)
+df_extended = OSM(config=config, update=True)
+
+print("\n✓ Mapping example complete!")
diff --git a/analysis/2_osm_cache_and_quality.py b/analysis/2_osm_cache_and_quality.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+"""
+OSM Tutorial Part 2: Cache Management and Data Quality
+=====================================================
+
+Learn how to:
+1. Manage the OSM cache system
+2. Track data quality and rejections
+3. Download data for new countries
+
+Cache Structure:
+The OSM module uses a unified cache directory containing:
+- osm_data.csv: Combined CSV cache for all countries
+- plants/: Raw plant data from OpenStreetMap
+- generators/: Generator data from OpenStreetMap
+- units/: Processed unit data
+
+You can set a custom cache location in config.yaml:
+OSM:
+  cache_dir: ~/osm_caches/global
+  fn: osm_data.csv
+"""
+
+from powerplantmatching.core import get_config
+from powerplantmatching.osm import (
+    find_outdated_caches,
+    get_country_coverage_data,
+    populate_cache,
+    print_coverage_report,
+)
+
+# Example 1: Check what's in the cache
+# ====================================
+print("=== Current Cache Status ===")
+
+# cache_dir selects the cache directory to inspect.
+# When it is None, the value from config.yaml is used,
+# falling back to ./osm_cache
+data = get_country_coverage_data(
+    cache_dir=None,  # Uses config value or default
+    check_live_counts=False,  # Don't query live OSM data
+)
+
+print_coverage_report(
+    coverage_data=data,  # Coverage data collected above
+    show_missing=False,
+    check_live_counts=False,  # Same setting as in the call above
+    show_outdated_only=False,
+)
+
+# Using a specific cache directory:
+# get_country_coverage_data(cache_dir="~/osm_caches/europe")
+
+# Note: check_live_counts=True would:
+# - Query the Overpass API for current element counts
+# - Compare cached vs. live data to identify outdated caches
+# - Show which countries have new power plants since last download
+# - This is slower as it makes API calls for each country
+
+
+# Example 2: Find outdated caches
+# ===============================
+# Identify countries where OSM has new data since the last download
+
+print("\n=== Checking for Outdated Data ===")
+outdated = find_outdated_caches(
+    threshold=0.95,  # Flag if cache has <95% of current OSM data
+    check_specific_countries=["Germany", "France", "Spain"],  # Limit the check
+)
+
+if outdated:
+    print(f"Found {len(outdated)} countries with outdated data:")
+    for country in outdated[:3]:  # Show at most the first 3 entries
+        print(f"  {country['name']}: {country['total_missing']} new elements")
+else:
+    print("All checked countries are up to date!")
+
+
+# Example 3: Populate cache for new countries
+# ===========================================
+print("\n=== Downloading New Data ===")
+
+# Download data for two small countries; the result is summarised below
+result = populate_cache(
+    countries=["Liechtenstein", "Monaco"],
+    cache_dir=None,  # Uses config value or default ./osm_cache
+    force_refresh=False,  # Skip if already cached
+    show_progress=True,  # Show download progress
+)
+
+# Or use a custom cache directory:
+# result = populate_cache(
+#     countries=["Kenya", "Uganda"],
+#     cache_dir="~/osm_caches/africa",
+#     force_refresh=False,
+#     show_progress=True,
+# )
+
+print("\nResults:")
+print(f"  Successfully downloaded: {result['succeeded']}")
+print(f"  Already cached: {result['skipped']}")
+print(f"  Failed: {result['failed']}")
+
+
+# Example 4: Understanding rejections
+# ===================================
+# See why some OSM elements were rejected during processing
+
+from powerplantmatching.osm import OverpassAPIClient, RejectionTracker, Units, Workflow
+
+config = get_config()["OSM"]  # Work on the OSM section of the configuration only
+config["missing_name_allowed"] = False  # Strict: require names
+
+# Process with rejection tracking
+rejection_tracker = RejectionTracker()  # Its rejection count is reported below
+units = Units()  # Its length is reported below as the number of valid plants
+
+with OverpassAPIClient(cache_dir=None) as client:  # Uses config value
+    workflow = Workflow(client, rejection_tracker, units, config)
+    workflow.process_country_data("Malta")  # Process a single country
+
+# Analyze rejections
+print("\n=== Data Quality Report for Malta ===")
+print(f"Valid power plants: {len(units)}")
+print(f"Rejected elements: {rejection_tracker.get_total_count()}")
+
+if rejection_tracker.get_total_count() > 0:
+    print("\nTop rejection reasons:")  # NOTE(review): confirm get_summary() is sorted
+    for reason, count in list(rejection_tracker.get_summary().items())[:3]:
+        print(f"  {reason}: {count}")
+
+    # Save detailed rejection report (the output directory is created if missing)
+    import os
+
+    os.makedirs("output", exist_ok=True)
+    rejection_tracker.generate_report().to_csv(
+        "output/malta_rejections.csv", index=False
+    )
+    print("\nDetailed rejection report saved to output/malta_rejections.csv")
+
+
+# Example 5: Force refresh specific countries
+# ==========================================
+# Update cache for countries with significant changes
+
+# This would re-download even if cached
+# result = populate_cache(
+#     countries=["South Africa"],
+#     force_refresh=True,  # Force new download
+#     show_progress=True
+# )
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,5 @@ ue @@
     gud
     hel
     BU
+    Nam
+    FO