EUBUCCO

This notebook shows how to get started with downloading and using the EUBUCCO dataset.

Access data

Option 1: Download single region and load as GeoDataFrame

bash

# Download a single region file; --continue-at - resumes a partially downloaded file.
curl --progress-bar --continue-at - --output CH04.parquet "https://s3.eubucco.com/eubucco/v0.2/buildings/parquet/nuts_id=CH04/CH04.parquet"

bash

# Alternative: fetch the same file with the AWS CLI.
# The request is unsigned and sent to the EUBUCCO endpoint.
aws s3 cp s3://eubucco/v0.2/buildings/parquet/nuts_id=CH04/CH04.parquet . \
    --no-sign-request \
    --endpoint-url https://s3.eubucco.com

python

import geopandas as gpd

# Read the downloaded region file from the working directory into a GeoDataFrame.
local_file = "CH04.parquet"
gdf = gpd.read_parquet(local_file)

Option 2: Stream directly into GeoDataFrame

python

import geopandas as gpd

# Build the object path for one region (the region code appears twice in the key).
region = "CH04"
s3_path = f"s3://eubucco/v0.2/buildings/parquet/nuts_id={region}/{region}.parquet"

# Anonymous access against the EUBUCCO S3 endpoint.
storage_opts = dict(
    anon=True,
    client_kwargs=dict(endpoint_url="https://s3.eubucco.com"),
)

gdf = gpd.read_parquet(s3_path, storage_options=storage_opts)

Option 3: Query with DuckDB (SQL)

python

import duckdb

con = duckdb.connect()

# Install and load the httpfs and spatial extensions, in that order.
for extension in ("httpfs", "spatial"):
    con.execute(f"INSTALL {extension}; LOAD {extension};")

# Point DuckDB at the EUBUCCO S3 endpoint; both access keys are set to empty strings.
s3_settings = {
    "s3_endpoint": "s3.eubucco.com",
    "s3_url_style": "path",
    "s3_region": "eu",
    "s3_access_key_id": "",
    "s3_secret_access_key": "",
}
for option, value in s3_settings.items():
    con.execute(f"SET {option}='{value}';")

python

import geopandas as gpd

# Fetch data with WKT-encoded geometries
# (the query serialises the geometry column with ST_AsWKT so that it can be
# parsed again with GeoSeries.from_wkt after the result is converted to pandas)
region = "CH04"
s3_path = f"s3://eubucco/v0.2/buildings/parquet/nuts_id={region}/{region}.parquet"
query = f"""
    SELECT * EXCLUDE geometry, ST_AsWKT(geometry) AS geometry
    FROM '{s3_path}' 
"""
df = con.execute(query).df()

# Convert to GeoDataFrame
# The CRS is set explicitly because the WKT strings themselves carry no CRS.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:3035",
)

Derive metadata

python

# Source columns that several derived columns are built from.
region_id = gdf["region_id"]
type_source = gdf["type_source"]
type_differs_from_geometry = gdf["geometry_source"] != type_source

# Region-based codes: prefixes of the region identifier.
gdf["country"] = region_id.str[:2]  # EU VAT 2-digit country code
gdf["NUTS1"] = region_id.str[:3]  # EU NUTS1 code

# Part of the building id before the first "-"; identifies touching buildings.
gdf["block_id"] = gdf["id"].str.split("-").str[0]

# Provenance flags for the building type attribute.
gdf["type_is_authoritative"] = type_source.str.contains("gov")  # authoritative type information
gdf["type_is_merged"] = type_differs_from_geometry & (type_source != "estimated")  # merged type information
gdf["type_is_estimated"] = type_source == "estimated"  # estimated type information
gdf["type_is_inferred"] = type_differs_from_geometry  # type either merged or estimated

Filter data (after loading)

Filter city

python

# Select the buildings of one city and draw them coloured by subtype, without axes.
in_city = gdf["city_id"] == "CHCH0121"
city = gdf.loc[in_city]
ax = city.plot(column="subtype", legend=True, figsize=(10, 8))
ax.set_axis_off()

Output

python

# Take an independent copy of the filtered rows: assigning a column to a
# boolean-filtered slice of `gdf` triggers pandas' SettingWithCopyWarning and
# leaves it ambiguous whether the write reaches the intended object.
city = gdf[gdf["city_id"] == "CHCH0247"].copy()

# Convert height to float before mapping it as a continuous colour column.
city["height"] = city["height"].astype(float)

# Interactive map of the city's buildings coloured by height.
city[["id", "height", "height_source", "geometry"]].explore(
    column="height",
    cmap="YlOrRd",
    tiles="CartoDB positron",
    legend=True,
)

Output

Drop non-governmental sources

python

# Keep only buildings whose geometry source is neither "osm" nor "msft".
non_gov_sources = ["osm", "msft"]
from_non_gov = gdf["geometry_source"].isin(non_gov_sources)
gov = gdf[~from_non_gov]

# Show three random rows as a quick check.
gov[["id", "geometry_source"]].sample(3)

Output

	id	geometry_source
87193	0102229e7dc540de-0	gov-switzerland
172909	4f77ec73011c4b4a-1	gov-switzerland
121576	c0dc8eb41e844455-0	gov-switzerland

Drop estimated attributes

python

# Keep only rows whose height source is not flagged as "estimated".
not_estimated = gdf["height_source"] != "estimated"
ground_truth_height = gdf[not_estimated]

# Show three random rows as a quick check.
preview_columns = ["id", "height", "height_source"]
ground_truth_height[preview_columns].sample(3)

Output

	id	height	height_source
76746	bd663c2187534357-0	7.4	gov-switzerland
187952	6c484e720422495c-0	7.0	gov-switzerland
241682	5ea9b484ec694aff-0	5.2	gov-switzerland

Drop attributes with large uncertainty

python

# Categorical: keep types with confidence above 0.8 (correct in >80% of cases).
# Rows without a confidence value are filled with 1.0 and therefore kept.
type_confidence = gdf["type_confidence"].fillna(1.0)
reliable_type = gdf[type_confidence > 0.8]

# Numerical: keep heights whose uncertainty interval is narrower than 2m.
# Rows without an interval are filled with a width of 0.0 and therefore kept.
interval_width = gdf["height_confidence_upper"] - gdf["height_confidence_lower"]
precise_height = gdf[interval_width.fillna(0.0) < 2.0]

preview_columns = [
    "id",
    "height",
    "height_source",
    "height_confidence_lower",
    "height_confidence_upper",
]
precise_height[preview_columns].sample(3)

Output

	id	height	height_source	height_confidence_lower	height_confidence_upper
188645	98baed995c394750-0	9.8	gov-switzerland	None	None
6773	60a2a6f495fb4e12-0	6.3	estimated	5.8	6.7
230851	2c825f0aca48467c-0	5.5	gov-switzerland	None	None

Filter data (while loading)

Filter based on attributes and region

python

# Extract buildings in France taller than 50m
# The glob matches the parquet files of every partition directory. With
# hive_partitioning enabled, the `nuts_id=...` path segment is available as a
# `nuts_id` column, which the WHERE clause filters on by prefix.
# LIMIT 10 keeps the result to a small preview.
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT *
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE nuts_id LIKE 'FR%' AND height > 50
    LIMIT 10
"""
df = con.execute(query).df()
# Display only the identifying columns and the height.
df[["id", "region_id", "height"]]

Output

	id	region_id	height
0	108d4bd7a74040c9-0	FR101	65.0
1	3b7e6afc92744131-0	FR101	60.0
2	e89b7be2736e4169-0	FR101	65.0
3	4daaaf16b1b54bfc-0	FR102	55.0
4	8ed075685334418b-0	FR102	51.0
5	c369bcc13a154016-0	FR102	55.8
6	c35813798c9f4162-0	FR102	55.0
7	546c9c84c5b346b5-0	FR102	51.0
8	609e7f2534974b68-0	FR102	95.0
9	9ce5dcfe94a343a3-0	FR102	53.0

Spatial filtering

python

# Extract buildings within bounding box (bbox needs to be in EPSG:3035)
# The comparison uses each building's own `bbox` struct: a building is returned
# only if its bbox lies entirely inside the query box (min >= lower bound and
# max <= upper bound on both axes). LIMIT 10 keeps the result to a small preview.
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT *
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE bbox.xmin >= 5300000 AND bbox.xmax <= 5400000
    AND bbox.ymin >= 1880000 AND bbox.ymax <= 1920000
    LIMIT 10
"""
df = con.execute(query).df()
# Display a subset of the attribute columns.
df[["id", "region_id", "subtype", "height", "geometry_source"]]

Output

	id	region_id	subtype	height	geometry_source
0	05d37030ab874ec1-0	EL541	others	5.1	msft
1	0bc0005f72494fd7-0	EL541	detached	5.4	msft
2	0cdd11ae251849b2-0	EL541	agricultural	5.4	msft
3	0ec6ff77f9924920-0	EL541	detached	5.9	msft
4	12eec1f43e3242d2-0	EL541	detached	5.2	msft
5	183053df36f74b91-0	EL541	detached	5.5	msft
6	372fd27c76994c5e-0	EL541	detached	6.0	msft
7	42b90d2ae23344b3-0	EL541	detached	6.3	msft
8	43b49459e1c64cba-0	EL541	others	5.2	msft
9	43b49459e1c64cba-1	EL541	detached	5.2	msft

Source filtering & country-level counts across Europe

python

# Count governmental buildings across Europe
# "Governmental" here means every row whose geometry_source is neither 'msft'
# nor 'osm'. Rows are grouped by the first two characters of region_id (the
# country code) and sorted by descending count. The query has no LIMIT, so it
# reads from all files matched by the glob.
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
query = f"""
    SELECT
        count(*) AS gov_count,
        LEFT(region_id, 2) AS country
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE geometry_source NOT IN ('msft', 'osm')
    GROUP BY country
    ORDER BY gov_count DESC
"""
count = con.execute(query).df()
count

Output

	gov_count	country
0	61785354	DE
1	47813048	FR
2	16305302	ES
3	15963070	IT
4	14407294	PL
5	9677792	NL
6	8208819	BE
7	5677226	DK
8	5404654	FI
9	3979152	CZ
10	3488619	SK
11	2626901	CH
12	1927402	LT
13	1178551	SI
14	801576	EE
15	714354	AT
16	563224	CY
17	144088	LU
18	136030	MT

Analyzing regional stats

Download the precomputed region-stats.parquet file from https://eubucco.com/files/.

python

# Load the region statistics file; the relative path expects it in the working directory.
stats = gpd.read_parquet("region-stats.parquet")

Attribute coverage

python

# Sum the per-region counts up to country level (non-count columns are dropped first).
country_stats = (
    stats.drop(columns=["geometry", "region_id", "region_name"])
    .groupby("country")
    .sum()
)

# Divide each n_gt_* count by the country's total building count `n`,
# rename the columns from n_gt_* to share_gt_*, and round to two decimals.
gt_columns = ["n_gt_type", "n_gt_subtype", "n_gt_height", "n_gt_floors", "n_gt_construction_year"]
shares = country_stats[gt_columns].div(country_stats["n"], axis=0)
shares = shares.rename(columns={name: name.replace("n_gt_", "share_gt_") for name in gt_columns})
shares.round(2)

Output

	share_gt_type	share_gt_subtype	share_gt_height	share_gt_floors	share_gt_construction_year
country
AT	0.17	0.10	0.08	0.02	0.00
BE	0.02	0.01	0.90	0.00	0.00
BG	0.05	0.02	0.00	0.02	0.00
CH	0.02	0.02	0.91	0.00	0.00
CY	0.02	0.01	0.00	0.64	0.00
CZ	0.70	0.54	0.04	0.03	0.02
DE	0.64	0.39	0.44	0.16	0.00
DK	0.00	0.00	0.00	0.00	0.00
EE	0.02	0.01	0.70	0.00	0.00
EL	0.03	0.02	0.00	0.01	0.00
ES	0.91	0.24	0.03	0.86	0.90
FI	0.01	0.01	0.03	0.01	0.00
FR	0.50	0.14	0.82	0.44	0.41
HR	0.07	0.02	0.00	0.01	0.00
HU	0.11	0.03	0.00	0.01	0.00
IE	0.55	0.17	0.05	0.08	0.00
IT	0.38	0.12	0.63	0.00	0.06
LT	0.01	0.01	0.01	0.00	0.00
LU	0.05	0.03	0.78	0.00	0.00
LV	0.08	0.04	0.00	0.03	0.00
MT	0.00	0.00	0.88	0.00	0.00
NL	0.04	0.02	0.84	0.00	0.95
NO	0.78	0.56	0.01	0.04	0.00
PL	0.05	0.03	0.66	0.05	0.00
PT	0.08	0.03	0.02	0.01	0.00
RO	0.03	0.01	0.00	0.01	0.00
SE	0.13	0.07	0.01	0.02	0.00
SI	0.01	0.01	0.77	0.14	0.00
SK	0.77	0.76	0.83	0.00	0.00
UK	0.36	0.10	0.33	0.06	0.00

python

# Ratio of n_gt_height to the total building count n, per region.
stats["height_coverage"] = stats["n_gt_height"] / stats["n"]

# Columns shown when hovering over a region on the map.
tooltip = [
    "n",
    "n_gov",
    "n_osm",
    "n_msft",
    "n_gt_type",
    "n_gt_subtype",
    "n_gt_height",
    "n_gt_floors",
    "n_floors_0_3",
    "n_floors_4_6",
    "n_floors_7_inf",
    "n_type_residential",
    "n_type_non_residential",
]

# Put the scaled count in its own column on a temporary frame instead of
# overwriting stats["n"]: an in-place overwrite would divide by 1000 again on
# every re-run of this cell, would corrupt any later use of stats["n"], and
# would make the tooltip's "n" inconsistent with the other raw counts.
stats_for_map = stats.assign(n_thousands=stats["n"].div(1000).round())
stats_for_map.explore(
    "n_thousands",
    legend=True,
    tiles="CartoDB positron",
    cmap="Blues",
    legend_kwds={"caption": "Number of Buildings (in thousands)"},
    tooltip=tooltip,
)

Output

H3-grid aggregation and visualization

python

# Simple aggregated visualization with H3 hexagons
import h3pandas

# Work on a copy holding only the two columns needed, so `gdf` is left untouched.
gdf_floor = gdf[["geometry", "height"]].copy()
gdf_floor["height"] = gdf_floor["height"].astype(float)
# Footprint area times height, divided by 1e6 and rounded to two decimals.
# NOTE(review): area x height is a volume rather than a floor area, and the
# units depend on the CRS of `gdf` (presumably metres in EPSG:3035) — confirm
# that the column name "floor_area" matches the intent.
gdf_floor["floor_area"] = ((gdf_floor.area * gdf_floor["height"]) / 1000 / 1000).round(2)
# Replace each footprint by its centroid, reprojected to EPSG:4326 before the H3 lookup.
gdf_floor["geometry"] = gdf_floor.centroid.to_crs("EPSG:4326")
# Aggregate per H3 cell at resolution 8 using the "sum" operation.
h3_grid = gdf_floor.h3.geo_to_h3_aggregate(resolution=8, operation="sum")

# Interactive map of the aggregated floor_area value per hexagon
h3_grid.explore(
    column="floor_area",
    cmap="YlGn",
    tiles="CartoDB positron",
    tooltip=["floor_area"],
    legend=True,
)

Output

python

# Install and load the H3 community extension on the existing connection.
con.execute("INSTALL h3 FROM community; LOAD h3;")
s3_path = "s3://eubucco/v0.2/buildings/parquet/*/*.parquet"
h3_resolution = 5

# The query runs in three steps:
#   1. buildings: take each building's bbox midpoint as an approximate centroid
#      and transform it from EPSG:3035 to EPSG:4326. Rows without a height are
#      skipped, and LIMIT caps the input at 1,000,000 rows (there is no ORDER
#      BY, so which rows are kept is not specified).
#   2. h3_stats: map every centroid to an H3 cell at `h3_resolution` and compute
#      the average height and the building count per cell.
#   3. final SELECT: attach each cell's boundary as WKT for plotting.
# NOTE(review): ST_Point receives the y midpoint first and the x midpoint
# second, and ST_X/ST_Y of the transformed point are passed as (lat, lng).
# This presumably relies on ST_Transform following each CRS's official axis
# order (northing/easting for EPSG:3035, latitude/longitude for EPSG:4326) —
# confirm against the installed DuckDB spatial version before reordering.
query = f"""
WITH buildings AS (
    SELECT
        height,
        ST_Transform(
            ST_Point(
                (bbox.ymin + bbox.ymax) / 2,
                (bbox.xmin + bbox.xmax) / 2
            ),
            'EPSG:3035',
            'EPSG:4326'
        ) AS centroid
    FROM read_parquet('{s3_path}', hive_partitioning = true)
    WHERE height IS NOT NULL
    LIMIT 1000000 -- comment out to run query at scale
),

h3_stats AS (
    SELECT
        h3_latlng_to_cell_string(
            ST_X(centroid),
            ST_Y(centroid),
            {h3_resolution}
        ) AS h3,
        AVG(height) AS avg_height,
        COUNT(*) AS n_buildings
    FROM buildings
    GROUP BY h3
)

SELECT
    h3,
    avg_height,
    n_buildings,
    h3_cell_to_boundary_wkt(h3) AS geometry
FROM h3_stats
"""

df = con.execute(query).df()

# Parse the WKT cell boundaries into geometries; the CRS is set to EPSG:4326.
h3_grid = gpd.GeoDataFrame(
    df,
    geometry=gpd.GeoSeries.from_wkt(df["geometry"]),
    crs="EPSG:4326",
)

# Map only cells with more than 100 buildings; the colour scale is capped at 20.
h3_grid[h3_grid["n_buildings"] > 100].explore(
    column="avg_height",
    vmax=20,
    cmap="YlOrRd",
    tiles="CartoDB positron",
    legend=True,
    tooltip=["avg_height", "n_buildings"],
)

Output

Getting started with EUBUCCO

Access data

Option 1: Download single region and load as GeoDataFrame

Option 2: Stream directly into GeoDataFrame

Option 3: Query with DuckDB (SQL)

Derive metadata

Filter data (after loading)

Filter city

Drop non-governmental sources

Drop estimated attributes

Drop attributes with large uncertainty

Filter data (while loading)

Filter based on attributes and region

Spatial filtering

Source filtering & country-level counts across Europe

Analyzing regional stats

Attribute coverage

H3-grid aggregation and visualization

Live SQL on the EUBUCCO data lake from your browser