duckdb basics
in-memory
duckdb can query files and dataframes
import duckdb
import polars as pl
duckdb.sql("SELECT * FROM 'example.csv'") # directly query a CSV file
duckdb.sql("SELECT * FROM 'example.parquet'") # directly query a Parquet file
duckdb.sql("SELECT * FROM 'example.json'") # directly query a JSON file
polars_df = pl.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM polars_df")
What's returned is called Relation. duckdb can turn Relation back to dataframes/disk.
duckdb.sql("SELECT 42").fetchall() # Python objects
duckdb.sql("SELECT 42").df() # Pandas DataFrame
duckdb.sql("SELECT 42").pl() # Polars DataFrame
duckdb.sql("SELECT 42").arrow() # Arrow Table
duckdb.sql("SELECT 42").fetchnumpy() # NumPy Arrays
duckdb.sql("SELECT 42").write_parquet("out.parquet") # Write to a Parquet file
duckdb.sql("SELECT 42").write_csv("out.csv") # Write to a CSV file
duckdb.sql("COPY (SELECT 42) TO 'out.parquet'") # Copy to a Parquet file
connection
import duckdb
with duckdb.connect("file.db") as con:
con.sql("CREATE TABLE test (i INTEGER)")
con.sql("INSERT INTO test VALUES (42)")
con.table("test").show()
# the context manager closes the connection automatically
extensions
to interoperate with sqlite
core extensions e.g. ducklake, azure, excel ...
example
in this example, we load csv files into a duckdb file, copy a table from an existing sqlite file into it using the sqlite extension, then copy all tables in the duckdb file into a new sqlite file.
the goal of this script is to use duckdb's schema inference to harmonize csv date format for sqlite to consume. The reason for the conversion to sqlite is that other parts of the codebase assumes sqlite db in the backend.
# script to load CSV files into DuckDB and convert to SQLite; use duckdb's schema inference.
import os
import glob
import re
import sqlite3
import duckdb
original = "og.db" # assume there is a table called table_metadata in the db
def parse_filename(filename_string):
"""
remove leading numbers and dashes, replace non-alphanumeric characters by underscores.
"""
parsed_name = re.sub(r'^\d+-', '', filename_string)
cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', parsed_name)
return cleaned_name
def load_csv_files_as_separate_tables(csv_folder_path, db_file_path):
"""
Loads each CSV file from a specified folder into its own separate table
in a DuckDB database file. The table name will be derived from the CSV filename.
Args:
csv_folder_path (str): The path to the folder containing the CSV files.
db_file_path (str): The path where the DuckDB database file will be created/stored.
"""
# Connect to DuckDB. If the file doesn't exist, it will be created.
with duckdb.connect(database=db_file_path, read_only=False) as con:
csv_files = glob.glob(os.path.join(csv_folder_path,'*.csv'))
if not csv_files: return
for csv_path in csv_files:
base_filename = os.path.splitext(os.path.basename(csv_path))[0]
table_name = parse_filename(base_filename)
# auto_detect=TRUE is very helpful for inferring schema
con.execute(f"""
CREATE OR REPLACE TABLE "{table_name}" AS
SELECT * FROM read_csv('{csv_path}', auto_detect=TRUE);
""")
def convert_duckdb_to_sqlite(duckdb_file, sqlite_file):
"""
Converts a DuckDB database to a SQLite database by copying all tables.
Args:
duckdb_file (str): Path to the existing DuckDB database file.
sqlite_file (str): Path where the new SQLite database file will be created.
"""
con_duck = duckdb.connect(database=duckdb_file)
con_duck.execute("INSTALL sqlite; LOAD sqlite;")
con_duck.execute(f"ATTACH '{sqlite_file}' AS new_sqlite_db (TYPE sqlite);") # empty
con_duck.execute(f"ATTACH '{original}' AS original (TYPE sqlite);") # contains table_metadata
con_duck.sql("""CREATE OR REPLACE TABLE table_metadata as (select * from original.table_metadata)""")
duckdb_tables = con_duck.execute("SHOW TABLES;").fetchall() # should see table_metadata in it
for table_name_tuple in duckdb_tables:
table_name = table_name_tuple[0]
con_duck.execute(f"CREATE TABLE new_sqlite_db.\"{table_name}\" AS SELECT * FROM \"{table_name}\";")
con_duck.close()
print(f"Successfully converted '{duckdb_file}' to '{sqlite_file}'.")
if __name__ == "__main__":
csv_folder_path = "./raw_datasets"
db_file_path = "000_duck.db"
sqlite_file_path = "001_sqlite.db"
load_csv_files_as_separate_tables(csv_folder_path, db_file_path)
convert_duckdb_to_sqlite(db_file_path, sqlite_file_path)