Skip to content

Commit

Permalink
Refactor database
Browse files Browse the repository at this point in the history
There are a few (large-ish) database changes here:

- Store all the User-Agent headers in one `user_agents` table, and the
  parsed browser/OS names in a `browser` and `systems` table. This is
  much more space-efficient: for 22 million rows it's about 1.7G of
  storage to store it for every pageview. This is now reduced to about
  200M (~500k unique values). It also makes everything more transparent
  and easier to debug.

- Do the same for all the pathnames: these were stored for every
  pageview, as well as in some stats tables together with the title. For
  22 million rows there are about 1 million unique paths. This makes
  both the table itself as well as some indexes a lot smaller. It also
  allows some queries to be a bit more efficient as we can just do an int
  comparison instead of a string one.

  This also makes searching for paths much more efficient, as we no
  longer need to scan the large hits table, but scan the much smaller
  paths table instead.

- Rename `id` to `table_id`; this makes joins and such much easier, and
  I figured I might as well do this while I'm at it.

All in all, this reduces the hits table from about 6.5G + 3G index to
2.6G + 1.2G index, and the various _stats tables etc. are a bit smaller
as well.

This is basically how it should have been done from the start, and if I
knew more about database design a year ago I would have done things
differently 🙃

All of this is a large change, and processing the migration may take a
while if you've got a large database. It takes several hours for
goatcounter.com's 22 million pageviews.

It's recommended to run a "vacuum full" to free up disk space after all
of this is finished (it can't be run from inside a transaction).
  • Loading branch information
arp242 committed Sep 19, 2020
1 parent 4e5e42a commit 0012306
Show file tree
Hide file tree
Showing 45 changed files with 2,168 additions and 423 deletions.
14 changes: 7 additions & 7 deletions admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ func (a *AdminStats) List(ctx context.Context) error {
end) as plan,
stripe,
sites.link_domain,
(select email from users where site=sites.id or site=sites.parent) as email,
(select email from users where site_id=sites.id or site_id=sites.parent) as email,
coalesce((
select sum(hit_counts.total) from hit_counts where site=sites.id
select sum(hit_counts.total) from hit_counts where site_id=sites.site_id
), 0) as total,
coalesce((
select sum(hit_counts.total) from hit_counts
where site=sites.id and hit_counts.hour >= %s
where site_id=sites.site_id and hit_counts.hour >= %s
), 0) as last_month
from sites
order by last_month desc`, interval(30)))
Expand Down Expand Up @@ -113,11 +113,11 @@ func (a *AdminSiteStat) ByID(ctx context.Context, id int64) error {
ival60 := interval(30)
err = zdb.MustGet(ctx).GetContext(ctx, a, fmt.Sprintf(`/* *AdminSiteStat.ByID */
select
coalesce((select hour from hit_counts where site=$1 order by hour desc limit 1), '1970-01-01') as last_data,
coalesce((select sum(total) from hit_counts where site=$1), 0) as count_total,
coalesce((select sum(total) from hit_counts where site=$1
coalesce((select hour from hit_counts where site_id=$1 order by hour desc limit 1), '1970-01-01') as last_data,
coalesce((select sum(total) from hit_counts where site_id=$1), 0) as count_total,
coalesce((select sum(total) from hit_counts where site_id=$1
and hour >= %[1]s), 0) as count_last_month,
coalesce((select sum(total) from hit_counts where site=$1
coalesce((select sum(total) from hit_counts where site_id=$1
and hour >= %[2]s
and hour <= %[1]s
), 0) as count_prev_month
Expand Down
2 changes: 1 addition & 1 deletion cmd/goatcounter/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func monitor() (int, error) {

query := `/* monitor */ select count(*) from hits where `
if *site > 0 {
query += fmt.Sprintf(`site=%d and `, *site)
query += fmt.Sprintf(`site_id=%d and `, *site)
}
if zdb.PgSQL(db) {
query += ` created_at > now() - interval '%d seconds'`
Expand Down
15 changes: 7 additions & 8 deletions cmd/goatcounter/reindex.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ func reindex() (int, error) {
if *since == "" {
w := ""
if site > 0 {
w = fmt.Sprintf(" where site=%d ", site)
w = fmt.Sprintf(" where site_id=%d ", site)
}

var first string
Expand Down Expand Up @@ -186,8 +186,7 @@ func dosite(
start = nnow.With(end.Add(12 * time.Hour)).BeginningOfMonth()
}

// Insert paths.
query := `select * from hits where site=$1 and created_at >= $2 and created_at <= $3`
query := `select * from hits where site_id=$1 and bot=0 and created_at>=$2 and created_at<=$3`

var pauses time.Duration
if pause > 0 {
Expand Down Expand Up @@ -223,14 +222,14 @@ func dosite(
func clearMonth(db *sqlx.DB, tables []string, month string, siteID int64) {
ctx := context.Background()

where := fmt.Sprintf(" where site=%d and cast(day as varchar) like '%s-__'", siteID, month)
where := fmt.Sprintf(" where site_id=%d and cast(day as varchar) like '%s-__'", siteID, month)
for _, t := range tables {
switch t {
case "hit_stats":
db.MustExecContext(ctx, `delete from hit_stats`+where)
case "hit_counts":
db.MustExecContext(ctx, fmt.Sprintf(
`delete from hit_counts where site=%d and cast(hour as varchar) like '%s-%%'`,
`delete from hit_counts where site_id=%d and cast(hour as varchar) like '%s-%%'`,
siteID, month))
case "browser_stats":
db.MustExecContext(ctx, `delete from browser_stats`+where)
Expand All @@ -240,7 +239,7 @@ func clearMonth(db *sqlx.DB, tables []string, month string, siteID int64) {
db.MustExecContext(ctx, `delete from location_stats`+where)
case "ref_counts":
db.MustExecContext(ctx, fmt.Sprintf(
`delete from ref_counts where site=%d and cast(hour as varchar) like '%s-%%'`,
`delete from ref_counts where site_id=%d and cast(hour as varchar) like '%s-%%'`,
siteID, month))
case "size_stats":
db.MustExecContext(ctx, `delete from size_stats`+where)
Expand All @@ -251,10 +250,10 @@ func clearMonth(db *sqlx.DB, tables []string, month string, siteID int64) {
db.MustExecContext(ctx, `delete from location_stats`+where)
db.MustExecContext(ctx, `delete from size_stats`+where)
db.MustExecContext(ctx, fmt.Sprintf(
`delete from hit_counts where site=%d and cast(hour as varchar) like '%s-%%'`,
`delete from hit_counts where site_id=%d and cast(hour as varchar) like '%s-%%'`,
siteID, month))
db.MustExecContext(ctx, fmt.Sprintf(
`delete from ref_counts where site=%d and cast(hour as varchar) like '%s-%%'`,
`delete from ref_counts where site_id=%d and cast(hour as varchar) like '%s-%%'`,
siteID, month))
}
}
Expand Down
81 changes: 58 additions & 23 deletions cron/browser_stat.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,46 +6,86 @@ package cron

import (
"context"
"strconv"
"sync"

"zgo.at/errors"
"zgo.at/gadget"
"zgo.at/goatcounter"
"zgo.at/zdb"
"zgo.at/zdb/bulk"
"zgo.at/zlog"
)

// Package-level cache used by getUA.
var (
// userAgentMap maps a user_agent_id to its {browser_id, system_id} pair.
// It is filled from the user_agents table on the first getUA call and
// extended on cache misses.
userAgentMap map[int64][2]int64
// getUAOnce guards the one-time load of the user_agents table in getUA.
getUAOnce sync.Once
)

// getUA returns the browser ID and system ID stored for the user agent with
// the given ID.
//
// The first call loads the entire user_agents table into the package-level
// userAgentMap cache. IDs that are not in the cache are looked up individually
// with UserAgent.ByID and added to the cache. If that lookup fails the error
// is logged and 0, 0 is returned; the failure is not cached, so a later call
// tries again.
//
// It panics if the initial load of the user_agents table fails.
func getUA(ctx context.Context, uaID int64) (browser, system int64) {
	getUAOnce.Do(func() {
		// sync.Once considers a panicking function to have run and never
		// retries it. Allocate the cache before anything below can panic;
		// otherwise a recovered panic would leave userAgentMap nil and the
		// cache-miss path further down would panic with "assignment to entry
		// in nil map" on every later successful lookup.
		userAgentMap = make(map[int64][2]int64)

		var rows []struct {
			UserAgentID int64 `db:"user_agent_id"`
			BrowserID   int64 `db:"browser_id"`
			SystemID    int64 `db:"system_id"`
		}
		err := zdb.MustGet(ctx).SelectContext(ctx, &rows,
			`select user_agent_id, browser_id, system_id from user_agents`)
		if err != nil {
			panic(err)
		}

		// Replace the placeholder with a map sized for the loaded rows.
		userAgentMap = make(map[int64][2]int64, len(rows))
		for _, r := range rows {
			userAgentMap[r.UserAgentID] = [2]int64{r.BrowserID, r.SystemID}
		}
	})

	if ua, ok := userAgentMap[uaID]; ok {
		return ua[0], ua[1]
	}

	// Not in the cache: fetch this single row and remember it.
	var u goatcounter.UserAgent
	if err := u.ByID(ctx, uaID); err != nil {
		zlog.Field("uaID", uaID).Error(err)
		return 0, 0
	}
	userAgentMap[uaID] = [2]int64{u.BrowserID, u.SystemID}
	return u.BrowserID, u.SystemID
}

// TODO: add path_id here too?

func updateBrowserStats(ctx context.Context, hits []goatcounter.Hit, isReindex bool) error {
return zdb.TX(ctx, func(ctx context.Context, tx zdb.DB) error {
// Group by day + browser ID.
type gt struct {
count int
countUnique int
day string
browser string
version string
browserID int64
}
grouped := map[string]gt{}
for _, h := range hits {
if h.Bot > 0 {
continue
}

browser, version := getBrowser(h.Browser)
if browser == "" {
continue
if h.BrowserID == 0 {
h.BrowserID, _ = getUA(ctx, h.UserAgentID)
}

day := h.CreatedAt.Format("2006-01-02")
k := day + browser + version
k := day + strconv.FormatInt(h.BrowserID, 10)
v := grouped[k]
if v.count == 0 {
v.day = day
v.browser = browser
v.version = version
v.browserID = h.BrowserID
if !isReindex {
var err error
v.count, v.countUnique, err = existingBrowserStats(ctx, tx,
h.Site, day, v.browser, v.version)
h.Site, day, v.browserID)
if err != nil {
return err
}
Expand All @@ -60,18 +100,18 @@ func updateBrowserStats(ctx context.Context, hits []goatcounter.Hit, isReindex b
}

siteID := goatcounter.MustGetSite(ctx).ID
ins := bulk.NewInsert(ctx, "browser_stats", []string{"site", "day",
"browser", "version", "count", "count_unique"})
ins := bulk.NewInsert(ctx, "browser_stats", []string{"site_id", "day",
"browser_id", "count", "count_unique"})
for _, v := range grouped {
ins.Values(siteID, v.day, v.browser, v.version, v.count, v.countUnique)
ins.Values(siteID, v.day, v.browserID, v.count, v.countUnique)
}
return ins.Finish()
})
}

func existingBrowserStats(
txctx context.Context, tx zdb.DB, siteID int64,
day, browser, version string,
day string, browserID int64,
) (int, int, error) {

var c []struct {
Expand All @@ -80,8 +120,8 @@ func existingBrowserStats(
}
err := tx.SelectContext(txctx, &c, `/* existingBrowserStats */
select count, count_unique from browser_stats
where site=$1 and day=$2 and browser=$3 and version=$4 limit 1`,
siteID, day, browser, version)
where site_id=$1 and day=$2 and browser_id=$3 limit 1`,
siteID, day, browserID)
if err != nil {
return 0, 0, errors.Wrap(err, "select")
}
Expand All @@ -90,12 +130,7 @@ func existingBrowserStats(
}

_, err = tx.ExecContext(txctx, `delete from browser_stats where
site=$1 and day=$2 and browser=$3 and version=$4`,
siteID, day, browser, version)
site_id=$1 and day=$2 and browser_id=$3`,
siteID, day, browserID)
return c[0].Count, c[0].CountUnique, errors.Wrap(err, "delete")
}

// getBrowser parses a User-Agent header with gadget.Parse and returns the
// browser name and browser version it reports.
func getBrowser(uaHeader string) (string, string) {
	parsed := gadget.Parse(uaHeader)
	name, version := parsed.BrowserName, parsed.BrowserVersion
	return name, version
}
26 changes: 10 additions & 16 deletions cron/hit_count.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package cron

import (
"context"
"strconv"

"zgo.at/goatcounter"
"zgo.at/goatcounter/cfg"
Expand All @@ -15,14 +16,12 @@ import (

func updateHitCounts(ctx context.Context, hits []goatcounter.Hit, isReindex bool) error {
return zdb.TX(ctx, func(ctx context.Context, tx zdb.DB) error {
// Group by day + path.
// Group by hour + pathID.
type gt struct {
total int
totalUnique int
hour string
event zdb.Bool
path string
title string
pathID int64
}
grouped := map[string]gt{}
for _, h := range hits {
Expand All @@ -31,16 +30,11 @@ func updateHitCounts(ctx context.Context, hits []goatcounter.Hit, isReindex bool
}

hour := h.CreatedAt.Format("2006-01-02 15:00:00")
k := hour + h.Path
k := hour + strconv.FormatInt(h.PathID, 10)
v := grouped[k]
if v.total == 0 {
v.hour = hour
v.path = h.Path
v.event = h.Event
}

if h.Title != "" {
v.title = h.Title
v.pathID = h.PathID
}

v.total += 1
Expand All @@ -51,20 +45,20 @@ func updateHitCounts(ctx context.Context, hits []goatcounter.Hit, isReindex bool
}

siteID := goatcounter.MustGetSite(ctx).ID
ins := bulk.NewInsert(ctx, "hit_counts", []string{"site", "path",
"title", "event", "hour", "total", "total_unique"})
ins := bulk.NewInsert(ctx, "hit_counts", []string{"site_id", "path_id",
"hour", "total", "total_unique"})
if cfg.PgSQL {
ins.OnConflict(`on conflict on constraint "hit_counts#site#path#hour" do update set
ins.OnConflict(`on conflict on constraint "hit_counts#site_id#path_id#hour" do update set
total=hit_counts.total + excluded.total,
total_unique=hit_counts.total_unique + excluded.total_unique`)
} else {
ins.OnConflict(`on conflict(site, path, hour) do update set
ins.OnConflict(`on conflict(site_id, path_id, hour) do update set
total=hit_counts.total + excluded.total,
total_unique=hit_counts.total_unique + excluded.total_unique`)
}

for _, v := range grouped {
ins.Values(siteID, v.path, v.title, v.event, v.hour, v.total, v.totalUnique)
ins.Values(siteID, v.pathID, v.hour, v.total, v.totalUnique)
}
return ins.Finish()
})
Expand Down
Loading

0 comments on commit 0012306

Please sign in to comment.