Commit
Optimized all queries using geolocation_json
kristof-mattei committed May 18, 2022
1 parent a5ab778 commit cac5514
Showing 2 changed files with 105 additions and 127 deletions.
3 changes: 3 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,3 @@
+{
+  "python.formatting.provider": "black"
+}
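The new workspace setting makes VS Code format Python files with black. For reference, a minimal sketch of the same formatting through black's Python API (assuming the black package is installed; the sample source string is invented):

import black

# Format a code string with black's default settings.
source = "x = {  'a':37,'b':42\n}\n"
print(black.format_str(source, mode=black.Mode()))
# -> x = {"a": 37, "b": 42}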
229 changes: 102 additions & 127 deletions api/api_util.py
@@ -80,66 +80,55 @@ def jump_by_month(start_date, end_date, month_step=1):
 
 
 def get_location_timeline(user):
-    qs_photos = (
-        Photo.objects.exclude(geolocation_json={})
-        .exclude(exif_timestamp=None)
-        .filter(owner=user)
-        .order_by("exif_timestamp")
-    )
-    timestamp_loc = []
-    paginator = Paginator(qs_photos, 5000)
-    for page in range(1, paginator.num_pages + 1):
-        current_page = [
-            (p.exif_timestamp, p.geolocation_json["features"][-1]["text"])
-            for p in paginator.page(page).object_list
-        ]
-        timestamp_loc = timestamp_loc + current_page
-
-    groups = []
-    uniquekeys = []
-    for k, g in groupby(timestamp_loc, lambda x: x[1]):
-        groups.append(list(g))  # Store group iterator as a list
-        uniquekeys.append(k)
-
-    city_start_end_duration = []
-    for idx, group in enumerate(groups):
-        city = group[0][1]
-        start = group[0][0]
-        if idx < len(groups) - 1:
-            end = groups[idx + 1][0][0]
-        else:
-            end = group[-1][0]
-        time_in_city = (end - start).total_seconds()
-
-        if time_in_city > 0:
-            city_start_end_duration.append([city, start, end, time_in_city])
+    with connection.cursor() as cursor:
+        raw_sql = """
+        WITH time_in_location AS (
+            SELECT
+                DISTINCT ON("feature" ->> 'text') "feature" ->> 'text' "location"
+                , MIN("api_photo"."exif_timestamp") "start"
+                , MAX("api_photo"."exif_timestamp") "end"
+                , EXTRACT(EPOCH FROM MAX("api_photo"."exif_timestamp") - MIN("api_photo"."exif_timestamp")) "duration"
+            FROM
+                "api_photo"
+                , json_array_elements(CAST("api_photo"."geolocation_json" -> 'features' AS JSON)) "feature"
+            WHERE
+                ("api_photo"."exif_timestamp" IS NOT NULL AND "feature" ->> 'text' IS NOT NULL AND "api_photo"."owner_id" = %s)
+            GROUP BY
+                "location"
+        )
-    locs = list(set([e[0] for e in city_start_end_duration]))
-    colors = sns.color_palette("Paired", len(locs)).as_hex()
+        SELECT
+            "time_in_location"."location"
+            , "time_in_location"."start"
+            , "time_in_location"."end"
+            , "time_in_location"."duration"
+        FROM
+            "time_in_location"
+        WHERE
+            ("time_in_location"."duration" > 0)
+        ORDER BY
+            "location" ASC;
+        """
+        cursor.execute(raw_sql, [user.id])
+        city_start_end_duration = [
+            (row[0], row[1], row[2], row[3]) for row in cursor.fetchall()
+        ]
 
-    loc2color = dict(zip(locs, colors))
+    colors = sns.color_palette("Paired", len(city_start_end_duration)).as_hex()
 
-    intervals_in_seconds = []
+    data = []
     for idx, sted in enumerate(city_start_end_duration):
-        intervals_in_seconds.append(
+        data.append(
             {
+                "data": sted[3].timestamp(),
+                "color": colors[idx],
                 "loc": sted[0],
                 "start": sted[1].timestamp(),
                 "end": sted[2].timestamp(),
                 "dur": sted[2].timestamp() - sted[1].timestamp(),
             }
         )
 
-    data = [
-        {
-            "data": [d["dur"]],
-            "color": loc2color[d["loc"]],
-            "loc": d["loc"],
-            "start": d["start"],
-            "end": d["end"],
-        }
-        for d in intervals_in_seconds
-    ]
     return data
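The rewrite replaces the paginated ORM scan and Python-side groupby with a single Postgres query: json_array_elements unnests each photo's geolocation_json -> 'features' array into one row per feature, and the CTE aggregates those rows into one start/end/duration interval per location. A minimal sketch of the same unnest-and-aggregate pattern outside Django, assuming a psycopg2 connection and the api_photo schema from this diff (connection parameters and the sample owner_id are invented):

import psycopg2

# Hypothetical DSN; adjust for your environment.
conn = psycopg2.connect("dbname=librephotos user=postgres")

# json_array_elements yields one row per (photo, feature) pair,
# which GROUP BY then collapses into one interval per location name.
sql = """
    SELECT "feature" ->> 'text' AS "location"
         , MIN("api_photo"."exif_timestamp") AS "start"
         , MAX("api_photo"."exif_timestamp") AS "end"
    FROM "api_photo"
       , json_array_elements(CAST("api_photo"."geolocation_json" -> 'features' AS JSON)) "feature"
    WHERE "api_photo"."exif_timestamp" IS NOT NULL
      AND "feature" ->> 'text' IS NOT NULL
      AND "api_photo"."owner_id" = %s
    GROUP BY "location";
"""

with conn.cursor() as cursor:
    cursor.execute(sql, [1])  # example owner_id
    for location, start, end in cursor.fetchall():
        print(location, start, end)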


@@ -313,91 +302,77 @@ def get_count_stats(user):
 
 
 def get_location_clusters(user):
     start = datetime.now()
-    photos = (
-        Photo.objects.filter(owner=user)
-        .exclude(geolocation_json={})
-        .only("geolocation_json")
-        .all()
-    )
-
-    coord_names = []
-    paginator = Paginator(photos, 5000)
-    for page in range(1, paginator.num_pages + 1):
-        for p in paginator.page(page).object_list:
-            for feature in p.geolocation_json["features"]:
-                if not feature["text"].isdigit():
-                    coord_names.append([feature["text"], feature["center"]])
-
-    groups = []
-    uniquekeys = []
-    coord_names.sort(key=lambda x: x[0])
-    for k, g in groupby(coord_names, lambda x: x[0]):
-        groups.append(list(g))  # Store group iterator as a list
-        uniquekeys.append(k)
-
-    res = [[g[0][1][1], g[0][1][0], g[0][0]] for g in groups]
-    elapsed = (datetime.now() - start).total_seconds()
-    logger.info("location clustering took %.2f seconds" % elapsed)
-    return res
+    with connection.cursor() as cursor:
+        raw_sql = """
+        SELECT
+            DISTINCT ON ("feature" ->> 'text') "feature" ->> 'text' "location"
+            , "feature" -> 'center' ->> 0
+            , "feature" -> 'center' ->> 1
+        FROM
+            "api_photo"
+            , json_array_elements(CAST("api_photo"."geolocation_json" -> 'features' AS JSON)) "feature"
+        WHERE
+            ("api_photo"."owner_id" = %s AND NOT ("feature" ->> 'text' ~ '^(-)?[0-9]+$'))
+        ORDER BY
+            "location" ASC;
+        """
+        cursor.execute(raw_sql, [user.id])
+        res = [[row[1], row[2], row[0]] for row in cursor.fetchall()]
+
+    elapsed = (datetime.now() - start).total_seconds()
+    logger.info("location clustering took %.2f seconds" % elapsed)
+    return res
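One behavioural detail of the new clusters query: the old code skipped features whose text passed str.isdigit(), while the SQL filter uses the regex ^(-)?[0-9]+$, which also treats negatively signed numbers as numeric. A standard-library sketch comparing the two predicates (the sample strings are invented):

import re

# Mirror of the SQL filter: NOT ("feature" ->> 'text' ~ '^(-)?[0-9]+$')
numeric = re.compile(r"^(-)?[0-9]+$")

for text in ["Ghent", "1190", "-12", "12b"]:
    print(text, text.isdigit(), bool(numeric.match(text)))
# "1190" is numeric under both checks; "-12" fails isdigit()
# but matches the regex, so only the SQL version filters it out.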


 def get_photo_country_counts(user):
-    photos_with_gps = Photo.objects.exclude(geolocation_json=None).filter(owner=user)
-    geolocations = [p.geolocation_json for p in photos_with_gps]
-    countries = []
-    for gl in geolocations:
-        if "features" in gl.keys():
-            for feature in gl["features"]:
-                if feature["place_type"][0] == "country":
-                    countries.append(feature["place_name"])
-
-    counts = Counter(countries)
-    return counts
+    with connection.cursor() as cursor:
+        raw_sql = """
+        SELECT
+            "feature" ->> 'place_name'
+            , COUNT("feature" ->> 'place_name')
+        FROM
+            "api_photo"
+            , json_array_elements(CAST("api_photo"."geolocation_json" -> 'features' AS JSON)) "feature"
+        WHERE
+            ("api_photo"."owner_id" = %s AND ("feature" -> 'place_type' ->> 0) = 'country')
+        GROUP BY
+            "feature" ->> 'place_name';
+        """
+        cursor.execute(raw_sql, [user.id])
+        return Counter({row[0]: row[1] for row in cursor.fetchall()})
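Because the GROUP BY already aggregates per country, the rows arrive as (place_name, count) pairs and Counter serves only as a dict-like wrapper that preserves the old return type and its .most_common() API. A small illustration, with made-up rows standing in for cursor.fetchall():

from collections import Counter

# Hypothetical rows in the (place_name, count) shape the query returns.
rows = [("Belgium", 120), ("United States", 45), ("Japan", 3)]

counts = Counter({place: n for place, n in rows})
print(counts.most_common(2))  # [('Belgium', 120), ('United States', 45)]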


 def get_location_sunburst(user):
-    photos_with_gps = (
-        Photo.objects.exclude(geolocation_json={})
-        .exclude(geolocation_json=None)
-        .filter(owner=user)
-    )
-
-    if photos_with_gps.count() == 0:
-        return {"children": []}
-    geolocations = []
-    paginator = Paginator(photos_with_gps, 5000)
-    for page in range(1, paginator.num_pages + 1):
-        for p in paginator.page(page).object_list:
-            geolocations.append(p.geolocation_json)
-
-    four_levels = []
-    for gl in geolocations:
-        out_dict = {}
-        if "features" in gl.keys():
-            if len(gl["features"]) >= 1:
-                out_dict[1] = gl["features"][-1]["text"]
-            if len(gl["features"]) >= 2:
-                out_dict[2] = gl["features"][-2]["text"]
-            if len(gl["features"]) >= 3:
-                out_dict[3] = gl["features"][-3]["text"]
-            four_levels.append(out_dict)
-
-    df = pd.DataFrame(four_levels)
-    df = (
-        df.groupby(df.columns.tolist())
-        .size()
-        .reset_index()
-        .rename(columns={4: "count"})
-    )
+    levels = []
+    with connection.cursor() as cursor:
+        raw_sql = """
+        SELECT
+            "api_photo"."geolocation_json" -> 'features' -> -1 ->> 'text' "l1"
+            , "api_photo"."geolocation_json" -> 'features' -> -2 ->> 'text' "l2"
+            , "api_photo"."geolocation_json" -> 'features' -> -3 ->> 'text' "l3",
+            COUNT(*)
+        FROM
+            "api_photo"
+        WHERE
+            ("api_photo"."owner_id" = %s AND jsonb_array_length("api_photo"."geolocation_json" -> 'features') >= 3)
+        GROUP BY
+            "l1"
+            , "l2"
+            , "l3"
+        ORDER BY
+            "l1"
+            , "l2"
+            , "l3"
+        """
+        cursor.execute(raw_sql, [user.id])
+        levels = [[row[0], row[1], row[2], row[3]] for row in cursor.fetchall()]
 
     data_structure = {"name": "Places I've visited", "children": []}
     palette = sns.color_palette("hls", 10).as_hex()
 
-    for data in df.iterrows():
-
-        current = data_structure
-        depth_cursor = current["children"]
-        for i, item in enumerate(data[1][:-2]):
+    for data in levels:
+        depth_cursor = data_structure["children"]
+        for i, item in enumerate(data[0:-2]):
             idx = None
             j = None
             for j, c in enumerate(depth_cursor):
@@ -410,11 +385,11 @@ def get_location_sunburst(user):
                 idx = len(depth_cursor) - 1
 
             depth_cursor = depth_cursor[idx]["children"]
-            if i == len(data[1]) - 3:
+            if i == len(data) - 3:
                 depth_cursor.append(
                     {
-                        "name": "{}".format(list(data[1])[-2]),
-                        "value": list(data[1])[-1],
+                        "name": data[-2],
+                        "value": data[-1],
                         "hex": random.choice(palette),
                     }
                 )
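The sunburst query leans on negative JSON array indexing ('features' -> -1 is the last, coarsest feature), so each row already arrives as [l1, l2, l3, count] in sorted order; the loop above then folds those rows into the nested children tree the chart expects. A self-contained sketch of that fold, with invented rows in place of the query result and a stand-in palette (the branch-matching step hidden behind the collapsed hunk is reconstructed here from the visible context, so treat it as an approximation):

import random

# Invented rows in the [l1, l2, l3, count] shape of the query result.
levels = [
    ["Belgium", "Flanders", "Ghent", 40],
    ["Belgium", "Flanders", "Antwerp", 25],
    ["Japan", "Kanto", "Tokyo", 7],
]

palette = ["#aabbcc", "#ccbbaa", "#bbaacc"]  # stand-in for the seaborn palette
data_structure = {"name": "Places I've visited", "children": []}

for data in levels:
    depth_cursor = data_structure["children"]
    for i, item in enumerate(data[0:-2]):
        # Reuse an existing branch with this name, or create it.
        idx = None
        for j, c in enumerate(depth_cursor):
            if item in c.values():
                idx = j
        if idx is None:
            depth_cursor.append({"name": item, "children": []})
            idx = len(depth_cursor) - 1
        depth_cursor = depth_cursor[idx]["children"]
        # At the deepest named level, attach the leaf with its photo count.
        if i == len(data) - 3:
            depth_cursor.append(
                {
                    "name": data[-2],
                    "value": data[-1],
                    "hex": random.choice(palette),
                }
            )

print(data_structure)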
