duckdb basics

in-memory

duckdb can query files and dataframes

import duckdb
import polars as pl

duckdb.sql("SELECT * FROM 'example.csv'")     # directly query a CSV file
duckdb.sql("SELECT * FROM 'example.parquet'") # directly query a Parquet file
duckdb.sql("SELECT * FROM 'example.json'")    # directly query a JSON file

polars_df = pl.DataFrame({"a": [42]})
duckdb.sql("SELECT * FROM polars_df")

What's returned is called Relation. duckdb can turn Relation back to dataframes/disk.

duckdb.sql("SELECT 42").fetchall()   # Python objects
duckdb.sql("SELECT 42").df()         # Pandas DataFrame
duckdb.sql("SELECT 42").pl()         # Polars DataFrame
duckdb.sql("SELECT 42").arrow()      # Arrow Table
duckdb.sql("SELECT 42").fetchnumpy() # NumPy Arrays

duckdb.sql("SELECT 42").write_parquet("out.parquet") # Write to a Parquet file
duckdb.sql("SELECT 42").write_csv("out.csv")         # Write to a CSV file
duckdb.sql("COPY (SELECT 42) TO 'out.parquet'")      # Copy to a Parquet file

connection

import duckdb

with duckdb.connect("file.db") as con:
    con.sql("CREATE TABLE test (i INTEGER)")
    con.sql("INSERT INTO test VALUES (42)")
    con.table("test").show()
    # the context manager closes the connection automatically

extensions

to interoperate with sqlite

import duckdb

con = duckdb.connect()
con.install_extension("sqlite")
con.load_extension("sqlite")

core extensions e.g. ducklake, azure, excel ...

example

in this example, we load csv files into a duckdb file, copy a table from an existing sqlite file into it using the sqlite extension, then copy all tables in the duckdb file into a new sqlite file.

the goal of this script is to use duckdb's schema inference to harmonize csv date format for sqlite to consume. The reason for the conversion to sqlite is that other parts of the codebase assumes sqlite db in the backend.

# script to load CSV files into DuckDB and convert to SQLite; use duckdb's schema inference.

import os
import glob
import re
import sqlite3
import duckdb

original = "og.db" # assume there is a table called table_metadata in the db

def parse_filename(filename_string):
    """
    remove leading numbers and dashes, replace non-alphanumeric characters by underscores.
    """
    parsed_name = re.sub(r'^\d+-', '', filename_string)
    cleaned_name = re.sub(r'[^a-zA-Z0-9_]', '_', parsed_name)
    return cleaned_name

def load_csv_files_as_separate_tables(csv_folder_path, db_file_path):
    """
    Loads each CSV file from a specified folder into its own separate table
    in a DuckDB database file. The table name will be derived from the CSV filename.

    Args:
        csv_folder_path (str): The path to the folder containing the CSV files.
        db_file_path (str): The path where the DuckDB database file will be created/stored.
    """
    # Connect to DuckDB. If the file doesn't exist, it will be created.
    with duckdb.connect(database=db_file_path, read_only=False) as con:
        csv_files = glob.glob(os.path.join(csv_folder_path,'*.csv'))
        if not csv_files: return
        for csv_path in csv_files:
            base_filename = os.path.splitext(os.path.basename(csv_path))[0]
            table_name = parse_filename(base_filename)            
            # auto_detect=TRUE is very helpful for inferring schema
            con.execute(f"""
                CREATE OR REPLACE TABLE "{table_name}" AS
                SELECT * FROM read_csv('{csv_path}', auto_detect=TRUE);
            """)


def convert_duckdb_to_sqlite(duckdb_file, sqlite_file):
    """
    Converts a DuckDB database to a SQLite database by copying all tables.

    Args:
        duckdb_file (str): Path to the existing DuckDB database file.
        sqlite_file (str): Path where the new SQLite database file will be created.
    """
    con_duck = duckdb.connect(database=duckdb_file)
    con_duck.execute("INSTALL sqlite; LOAD sqlite;")
    con_duck.execute(f"ATTACH '{sqlite_file}' AS new_sqlite_db (TYPE sqlite);") # empty 
    con_duck.execute(f"ATTACH '{original}' AS original (TYPE sqlite);") # contains table_metadata
    con_duck.sql("""CREATE OR REPLACE TABLE table_metadata as (select * from original.table_metadata)""")
    duckdb_tables = con_duck.execute("SHOW TABLES;").fetchall() # should see table_metadata in it

    for table_name_tuple in duckdb_tables:
        table_name = table_name_tuple[0]
        con_duck.execute(f"CREATE TABLE new_sqlite_db.\"{table_name}\" AS SELECT * FROM \"{table_name}\";")

    con_duck.close()
    print(f"Successfully converted '{duckdb_file}' to '{sqlite_file}'.")

if __name__ == "__main__":

    csv_folder_path = "./raw_datasets"  
    db_file_path = "000_duck.db"  
    sqlite_file_path = "001_sqlite.db"

    load_csv_files_as_separate_tables(csv_folder_path, db_file_path)
    convert_duckdb_to_sqlite(db_file_path, sqlite_file_path)