Skip to content

Pandas and Dask

We have our data in the form of 12 Parquet files.

#| file: src/pandas_and_dask.py
import pandas
import geopandas
from pathlib import Path

from download_data import taxi_data, taxi_filename
from utils import get_project_root

# All paths below are built from the directory returned by get_project_root().
project_root = get_project_root()
data_dir = project_root / "data"

# Fetch the file named for January 2025 and load it as a DataFrame.
# NOTE(review): taxi_data(...) presumably returns a download/cache registry
# whose fetch() yields a local file path -- confirm in download_data.
data = taxi_data(data_dir)
jan_path = data.fetch(taxi_filename(month=1, year=2025))
jan = pandas.read_parquet(jan_path)

# Largest pickup location ID present in the January data. The value is not
# stored; it is only visible when this runs in an interactive session.
pickup_ids = jan["PULocationID"]
pickup_ids.max()

# Read the taxi zone shapefile from the data directory.
zones_file = data_dir / "taxi_zones" / "taxi_zones.shp"
shape = geopandas.read_file(zones_file)

# Reproject to EPSG:4326 before plotting, colouring zones by the "borough"
# column.
shape_lonlat = shape.to_crs("EPSG:4326")
shape_lonlat.plot("borough", legend=True)

if __name__ == "__main__":
    # Everything above already runs at import time; nothing extra to do here.
    pass