Tutorial · Interactive notebook

Getting started with EUBUCCO

Download, query, filter, and visualize Europe’s 322 million building footprints — with runnable code for GeoPandas, DuckDB, and more. Every cell below is taken straight from the notebook, outputs included.

  • Python
  • GeoPandas
  • DuckDB
  • Folium
  • H3

This notebook shows how to get started with downloading and using the EUBUCCO dataset.

Access data

Option 1: Download single region and load as GeoDataFrame

bash
curl --progress-bar -C - -o CH04.parquet "https://s3.eubucco.com/eubucco/v0.2/buildings/parquet/nuts_id=CH04/CH04.parquet"
bash
# Alternatively, use the AWS CLI to download the file:
aws s3 cp s3://eubucco/v0.2/buildings/parquet/nuts_id=CH04/CH04.parquet . \
    --endpoint-url https://s3.eubucco.com \
    --no-sign-request
python
import geopandas as gpd

gdf = gpd.read_parquet("CH04.parquet")

Option 2: Stream directly into GeoDataFrame

python
import geopandas as gpd

region = "CH04"
s3_path = f"s3://eubucco/v0.2/buildings/parquet/nuts_id={region}/{region}.parquet"
storage_opts = {
    "anon": True,
    "client_kwargs": {"endpoint_url": "https://s3.eubucco.com"}
}

gdf = gpd.read_parquet(s3_path, storage_options=storage_opts)

Option 3: Query with DuckDB (SQL)

python
import duckdb

con = duckdb.connect()
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute("INSTALL spatial; LOAD spatial;")

# Specify the S3 endpoint 
con.execute("SET s3_endpoint='s3.eubucco.com';") 
con.execute("SET s3_url_style='path';")
con.execute("SET s3_region='eu';")
con.execute("SET s3_access_key_id='';")
con.execute("SET s3_secret_access_key='';")
python
import geopandas as gpd

# Fetch data with WKB-encoded geometries
region = "CH04"
s3_path = f"s3://eubucco/v0.2/buildings/parquet/nuts_id={region}/{region}.parquet"
query = f"""
    SELECT * EXCLUDE geometry, ST_AsWKT(geometry) AS geometry
    FROM '{s3_path}' 
"""
df = con.execute(query).df()

# Convert to GeoDataFrame
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:3035",
)

Derive metadata

python
gdf["country"] = gdf["region_id"].str[:2]  # EU VAT 2-digit country code
gdf["NUTS1"] = gdf["region_id"].str[:3]  # EU NUTS1 code
gdf["block_id"] = gdf["id"].str.split("-").str[0]  # Identify touching buildings

gdf["type_is_authoritative"] = gdf["type_source"].str.contains("gov")  # Identify buildings with authoritative type information
gdf["type_is_merged"] = (gdf["geometry_source"] != gdf["type_source"]) & (gdf["type_source"] != "estimated")  # Identify buildings with merged type information
gdf["type_is_estimated"] = gdf["type_source"] == "estimated"  # Identify buildings with estimated type information
gdf["type_is_inferred"] = gdf["geometry_source"] != gdf["type_source"]  # Identify buildings with type being merged or estimated

Filter data (after loading)

Filter city

python
city = gdf[gdf["city_id"] == "CHCH0121"]
city.plot(column="subtype", legend=True, figsize=(10, 8)).set_axis_off()
Output
Plot output from cell 15
python
city = gdf[gdf["city_id"] == "CHCH0247"]
city["height"] = city["height"].astype(float)
city[["id", "height", "height_source", "geometry"]].explore(
    column="height",
    cmap="YlOrRd",
    tiles="CartoDB positron",
    legend=True,
)
Output

Drop non-governmental sources

python
gov = gdf[~gdf["geometry_source"].isin(["osm", "msft"])]
gov[["id", "geometry_source"]].sample(3)
Output
id geometry_source
87193 0102229e7dc540de-0 gov-switzerland
172909 4f77ec73011c4b4a-1 gov-switzerland
121576 c0dc8eb41e844455-0 gov-switzerland

Drop estimates attributes

python
ground_truth_height = gdf[gdf["height_source"] != "estimated"]
ground_truth_height[["id", "height", "height_source"]].sample(3)
Output
id height height_source
76746 bd663c2187534357-0 7.4 gov-switzerland
187952 6c484e720422495c-0 7.0 gov-switzerland
241682 5ea9b484ec694aff-0 5.2 gov-switzerland

Drop attributes with large uncertainty

python
# Categorical: Correct type in >80% of cases
reliable_type = gdf[gdf["type_confidence"].fillna(1.0) > 0.8]

# Numerical: Precise height (uncertainty interval < 2m)
precise_height = gdf[(gdf["height_confidence_upper"] - gdf["height_confidence_lower"]).fillna(0.0) < 2.0]
precise_height[["id", "height", "height_source", "height_confidence_lower", "height_confidence_upper"]].sample(3)
Output
id height height_source height_confidence_lower height_confidence_upper
188645 98baed995c394750-0 9.8 gov-switzerland None None
6773 60a2a6f495fb4e12-0 6.3 estimated 5.8 6.7
230851 2c825f0aca48467c-0 5.5 gov-switzerland None None

Filter data (while loading)

Filter based on attributes and region

python
# Extract buildings in France taller than 50m
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT *
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE nuts_id LIKE 'FR%' AND height > 50
    LIMIT 10
"""
df = con.execute(query).df()
df[["id", "region_id", "height"]]
Output
id region_id height
0 108d4bd7a74040c9-0 FR101 65.0
1 3b7e6afc92744131-0 FR101 60.0
2 e89b7be2736e4169-0 FR101 65.0
3 4daaaf16b1b54bfc-0 FR102 55.0
4 8ed075685334418b-0 FR102 51.0
5 c369bcc13a154016-0 FR102 55.8
6 c35813798c9f4162-0 FR102 55.0
7 546c9c84c5b346b5-0 FR102 51.0
8 609e7f2534974b68-0 FR102 95.0
9 9ce5dcfe94a343a3-0 FR102 53.0

Spatial filtering

python
# Extract buildings within bounding box (bbox needs to be in EPSG:3035)
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT *
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE bbox.xmin >= 5300000 AND bbox.xmax <= 5400000
    AND bbox.ymin >= 1880000 AND bbox.ymax <= 1920000
    LIMIT 10
"""
df = con.execute(query).df()
df[["id", "region_id", "subtype", "height", "geometry_source"]]
Output
id region_id subtype height geometry_source
0 05d37030ab874ec1-0 EL541 others 5.1 msft
1 0bc0005f72494fd7-0 EL541 detached 5.4 msft
2 0cdd11ae251849b2-0 EL541 agricultural 5.4 msft
3 0ec6ff77f9924920-0 EL541 detached 5.9 msft
4 12eec1f43e3242d2-0 EL541 detached 5.2 msft
5 183053df36f74b91-0 EL541 detached 5.5 msft
6 372fd27c76994c5e-0 EL541 detached 6.0 msft
7 42b90d2ae23344b3-0 EL541 detached 6.3 msft
8 43b49459e1c64cba-0 EL541 others 5.2 msft
9 43b49459e1c64cba-1 EL541 detached 5.2 msft

Source filtering & country-level counts across Europe

python
# Count governmental buildings across Europe
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT
        count(*) AS gov_count,
        LEFT(region_id, 2) AS country
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE geometry_source NOT IN ('msft', 'osm')
    GROUP BY country
    ORDER BY gov_count DESC
"""
count = con.execute(query).df()
count
Output
gov_count country
0 61785354 DE
1 47813048 FR
2 16305302 ES
3 15963070 IT
4 14407294 PL
5 9677792 NL
6 8208819 BE
7 5677226 DK
8 5404654 FI
9 3979152 CZ
10 3488619 SK
11 2626901 CH
12 1927402 LT
13 1178551 SI
14 801576 EE
15 714354 AT
16 563224 CY
17 144088 LU
18 136030 MT

Analyzing regional stats

Download precomputed region-stats.parquet file from https://eubucco.com/files/.

python
stats = gpd.read_parquet("region-stats.parquet")

Attribute coverage

python
country_stats = stats.drop(columns=["geometry", "region_id", "region_name"]).groupby("country").sum()
country_stats[["n_gt_type", "n_gt_subtype", "n_gt_height", "n_gt_floors", "n_gt_construction_year"]].div(country_stats["n"], axis=0).rename(columns=lambda x: x.replace("n_gt_", "share_gt_")).round(2)
Output
share_gt_type share_gt_subtype share_gt_height share_gt_floors share_gt_construction_year
country
AT 0.17 0.10 0.08 0.02 0.00
BE 0.02 0.01 0.90 0.00 0.00
BG 0.05 0.02 0.00 0.02 0.00
CH 0.02 0.02 0.91 0.00 0.00
CY 0.02 0.01 0.00 0.64 0.00
CZ 0.70 0.54 0.04 0.03 0.02
DE 0.64 0.39 0.44 0.16 0.00
DK 0.00 0.00 0.00 0.00 0.00
EE 0.02 0.01 0.70 0.00 0.00
EL 0.03 0.02 0.00 0.01 0.00
ES 0.91 0.24 0.03 0.86 0.90
FI 0.01 0.01 0.03 0.01 0.00
FR 0.50 0.14 0.82 0.44 0.41
HR 0.07 0.02 0.00 0.01 0.00
HU 0.11 0.03 0.00 0.01 0.00
IE 0.55 0.17 0.05 0.08 0.00
IT 0.38 0.12 0.63 0.00 0.06
LT 0.01 0.01 0.01 0.00 0.00
LU 0.05 0.03 0.78 0.00 0.00
LV 0.08 0.04 0.00 0.03 0.00
MT 0.00 0.00 0.88 0.00 0.00
NL 0.04 0.02 0.84 0.00 0.95
NO 0.78 0.56 0.01 0.04 0.00
PL 0.05 0.03 0.66 0.05 0.00
PT 0.08 0.03 0.02 0.01 0.00
RO 0.03 0.01 0.00 0.01 0.00
SE 0.13 0.07 0.01 0.02 0.00
SI 0.01 0.01 0.77 0.14 0.00
SK 0.77 0.76 0.83 0.00 0.00
UK 0.36 0.10 0.33 0.06 0.00
python
stats["height_coverage"] = stats["n_gt_height"] / stats["n"]

tooltip = [
    "n",
    "n_gov",
    "n_osm",
    "n_msft",
    "n_gt_type",
    "n_gt_subtype",
    "n_gt_height",
    "n_gt_floors",
    "n_floors_0_3",
    "n_floors_4_6",
    "n_floors_7_inf",
    "n_type_residential",
    "n_type_non_residential",
]
stats["n"] = stats["n"].div(1000).round()
stats.explore(
    "n",
    legend=True,
    tiles="CartoDB positron",
    cmap="Blues",
    legend_kwds={"caption": "Number of Buildings (in thousands)"},
    tooltip=tooltip,
)
Output

H3-grid aggregation and visualization

python
# Simple aggregated visualization with H3 hexagons
import h3pandas

gdf_floor = gdf[["geometry", "height"]].copy()
gdf_floor["height"] = gdf_floor["height"].astype(float)
gdf_floor["floor_area"] = ((gdf_floor.area * gdf_floor["height"]) / 1000 / 1000).round(2)
gdf_floor["geometry"] = gdf_floor.centroid.to_crs("EPSG:4326")
h3_grid = gdf_floor.h3.geo_to_h3_aggregate(resolution=8, operation="sum")

# precision
h3_grid.explore(
    column="floor_area",
    cmap="YlGn",
    tiles="CartoDB positron",
    tooltip=["floor_area"],
    legend=True,
)
Output
python
con.execute("INSTALL h3 FROM community; LOAD h3;")
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
h3_resolution = 5

query = f"""
WITH buildings AS (
    SELECT
        height,
        ST_Transform(
            ST_Point(
                (bbox.ymin + bbox.ymax) / 2,
                (bbox.xmin + bbox.xmax) / 2
            ),
            'EPSG:3035',
            'EPSG:4326'
        ) AS centroid
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE height IS NOT NULL
    LIMIT 1000000 -- comment out to run query at scale
),

h3_stats AS (
    SELECT
        h3_latlng_to_cell_string(
            ST_X(centroid),
            ST_Y(centroid),
            {h3_resolution}
        ) AS h3,
        AVG(height) AS avg_height,
        COUNT(*) AS n_buildings
    FROM buildings
    GROUP BY h3
)

SELECT
    h3,
    avg_height,
    n_buildings,
    h3_cell_to_boundary_wkt(h3) AS geometry
FROM h3_stats
"""

df = con.execute(query).df()

h3_grid = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:4326",
)

h3_grid[h3_grid["n_buildings"] > 100].explore(
    column="avg_height",
    vmax=20,
    cmap="YlOrRd",
    tiles="CartoDB positron",
    legend=True,
    tooltip=["avg_height", "n_buildings"],
)
Output

Try it yourself — no install required

Live SQL on the EUBUCCO data lake from your browser

This runs DuckDB-WASM in your browser and reads an EUBUCCO region file (the Zürich region, CH04) straight over HTTP — no download, no backend. Edit the SQL and hit run.