Friday, April 21, 2023

DuckDB ASOF JOIN Example

Here's a quick example of duckdb's new ASOF JOIN.

Problem: we have a time-based price table; if you have a sale time that falls in the gaps, you get NULL price and total.

Solution: "ASOF JOIN" picks a good value for "in the gap" sales.

Create a price table and sales table.


create table prices as (

    SELECT '2001-01-01 00:16:00'::TIMESTAMP + INTERVAL (v) MINUTE AS ticker_time,

        v as unit_price

    FROM range(0,5) vals(v)

);


create table sales(item text, sale_time timestamp, quantity int);

insert into sales values('a', '2001-01-01 00:18:00', 10);

insert into sales values('b', '2001-01-01 00:18:30', 20);

insert into sales values('c', '2001-01-01 00:19:00', 30);


select * from prices;

┌─────────────────────┬────────────┐

│     ticker_time     │ unit_price │

│      timestamp      │   int64    │

├─────────────────────┼────────────┤

│ 2001-01-01 00:16:00 │          0 │

│ 2001-01-01 00:17:00 │          1 │

│ 2001-01-01 00:18:00 │          2 │ No unit_price for 18:30!

│ 2001-01-01 00:19:00 │          3 │

│ 2001-01-01 00:20:00 │          4 │

└─────────────────────┴────────────┘

select * from sales;

┌─────────┬─────────────────────┬──────────┐

│  item   │      sale_time      │ quantity │

│ varchar │      timestamp      │  int32   │

├─────────┼─────────────────────┼──────────┤

│ a       │ 2001-01-01 00:18:00 │       10 │

│ b       │ 2001-01-01 00:18:30 │       20 │

│ c       │ 2001-01-01 00:19:00 │       30 │

└─────────┴─────────────────────┴──────────┘

Normal Left Join, problem for the 18:30 sale!


-- no price value for 18:30, so item b's unit_price and total are NULL!


select s.*, p.unit_price, s.quantity * p.unit_price as total

 from sales s left join prices p

   on s.sale_time = p.ticker_time;


┌─────────┬─────────────────────┬──────────┬────────────┬───────┐

│  item   │      sale_time      │ quantity │ unit_price │ total │

│ varchar │      timestamp      │  int32   │   int64    │ int64 │

├─────────┼─────────────────────┼──────────┼────────────┼───────┤

│ a       │ 2001-01-01 00:18:00 │       10 │          2 │    20 │

│ c       │ 2001-01-01 00:19:00 │       30 │          3 │    90 │

│ b       │ 2001-01-01 00:18:30 │       20 │       NULL │  NULL │

└─────────┴─────────────────────┴──────────┴────────────┴───────┘

ASOF Join, picks a good price for the 18:30 sale!


-- using asof, 18:30 "rounds down" to use the 18:00 unit_price


select s.*, p.unit_price, s.quantity * p.unit_price as total_cost

  from sales s asof left join prices p

    on s.sale_time >= p.ticker_time;


┌─────────┬─────────────────────┬──────────┬────────────┬────────────┐

│  item   │      sale_time      │ quantity │ unit_price │ total_cost │

│ varchar │      timestamp      │  int32   │   int64    │   int64    │

├─────────┼─────────────────────┼──────────┼────────────┼────────────┤

│ a       │ 2001-01-01 00:18:00 │       10 │          2 │         20 │

│ b       │ 2001-01-01 00:18:30 │       20 │          2 │         40 │

│ c       │ 2001-01-01 00:19:00 │       30 │          3 │         90 │

└─────────┴─────────────────────┴──────────┴────────────┴────────────┘


Duckdb Full Text Search Example

I learned a lot from Laurens Kuiper's blog post Testing out DuckDB's Full Text Search Extension, but unfortunately couldn't find the data set he was using for testing.

So, here's a simple "hello world" example using the easily accessible (and parsable!) kjv.txt. Note that everything past the table creation can be done in the duckdb cli.

% cat duckdb-full-text-search.py      

import re
import duckdb

# -------- prepare the data -----------
fd = open('kjv.txt') # https://www.o-bible.com/download/kjv.txt
fd.readline() # skip first line
data = []
for line in fd.readlines():
    line = line.rstrip()
    # book, chap, verse, body = re.match(r'(\d?[A-Za-z]+)(\d+):(\d+)\s+(.*)', line).groups()
    ref, body = re.match(r'(\d?[A-Za-z]+\d+:\d+)\s+(.*)', line).groups()
    data.append((ref,body,))

# -------- create the table -----------
db = duckdb.connect()
db.cursor().execute("create table corpus(ref text, body text)")
db.cursor().executemany("insert into corpus(ref, body) values($1, $2)", data)

# -------- everything below could be run in the duckdb cli ---------

# -------- create the index -----------
db.cursor().execute(
    """
    install 'fts';
    load fts;
    pragma create_fts_index('corpus', 'ref', 'ref', 'body');
    """)

# -------- full text query -----------
print(db.sql("""
    select fts_main_corpus.match_bm25(ref, 'whale') as score,
      ref, body as "search for 'whale'"
    from corpus
    where score is not null
    order by score;
    """))

% python3 ./duckdb-full-text-search.py
┌───────────────────┬──────────┬───────────────────────────────────────────────┐
│       score       │   ref    │              search for 'whale'               │
│      double       │ varchar  │                    varchar                    │
├───────────────────┼──────────┼───────────────────────────────────────────────┤
│   2.7248255618541 │ Eze32:2  │ Son of man, take up a lamentation for Phara…  │
│ 3.839526928067141 │ Ge1:21   │ And God created great whales, and every liv…  │
│ 3.839526928067141 │ Mat12:40 │ For as Jonas was three days and three night…  │
│ 6.497660955190547 │ Job7:12  │ Am I a sea, or a whale, that thou settest a…  │
└───────────────────┴──────────┴───────────────────────────────────────────────┘