Skip to content

SQL Example

jacques-n edited this page Dec 4, 2012 · 9 revisions

Goal

For each day, what is the total number of unique visitors.

Data Sources

events table

record: { user: "1", interaction: "add to cart", datetime: "12/1/2011 3:45pm" }

SQL Query

SELECT 
  CONVERT(date, e.datatime) AS day, 
  COUNT(DISTINCT 'e.user') as cnt 
  FROM events e
  GROUP BY day 

Logical Plan

{ op: "scan" selection: {table: 'events'} }
{ op: "transform" transforms: [{ref: "day", expr: "convert(date, events.datetime)"}]}
{ op: "group", groupings: [{ref: "dayKey", expr: "day"}, {ref: "userKey", expr: "user" }]
{ op: "aggregate", type: "simple", keys: ["dayKey", "userKey"], aggregations: [] }   // effectively a combine
{ op: "aggregate", type: "simple", keys: ["dayKey"], aggregations: [ {ref: "cnt", expr: "count(1)"} ] }

Pseudo Physical Queries

Simple

scan convert(date, datetime) as day, user
hash_aggregate(day+user, 1 as cnt1)
hash_aggregate(day, sum(cnt1) as cnt)

Distributed (small)

scan convert(date, datetime) as day, user
partial_aggregate(day+user)
--exchange( partition(day+user))--
hash_aggregate(day+user, 1 as cnt1)
partial_aggregate(day, sum(cnt1) as cnt2)
--exchange()--
hash_aggregate(day, sum(cnt) as cnt)
union_all()

Distributed (large)

scan convert(date, datetime) as day, user
partial_aggregate(day+user)
--exchange(partition(day+user))--
hash_aggregate(day+user, 1 as cnt1)
partial_aggregate(day, sum(cnt1),)
--exchange( partition(day) )--
hash_aggregate(day, sum(cnt))
--exchange()--
union_all()

Distributed (large, rack aware)

scan convert(date, datetime) as day, user
streaming_aggregate(day+user, 1 as ignore, )
exchange(rack-partition(day), partition(user))
hash_aggregate(user, 1 as cnt1)
streaming_aggregate(day, sum(cnt1) as cnt2)
exchange(partition(day))
hash_aggregate(day, sum(cnt2) as cnt)
exchange()
union_all()