Skip to content

Commit

Permalink
fix location conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
robert.sloan committed May 25, 2021
1 parent a1a062a commit 1686ffc
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 138 deletions.
85 changes: 45 additions & 40 deletions twitter-harvester/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from datetime import datetime, timedelta
from time import sleep
from random import random
import json

TwitterUrl = namedtuple(
typename='TwitterUrl',
Expand All @@ -15,15 +16,16 @@

def auth(couchdb: CouchDB, redis: Redis, endpoint: str):
# Select valid token
now = datetime.utcnow().timestamp()
min_window = (datetime.utcnow() - timedelta(minutes=15)).timestamp()

complete = False
while not complete:
token = None
while not token:
result = couchdb["tokens"].get_query_result(
selector={
try:
couchdb.connect()
while not token:
now = datetime.utcnow().timestamp()
min_window = (datetime.utcnow() - timedelta(minutes=15)).timestamp()

selector = {
# ensure less than x calls per window
endpoint: {
"$or": [
Expand All @@ -36,49 +38,52 @@ def auth(couchdb: CouchDB, redis: Redis, endpoint: str):
"last_used": {
"$or": [
{"$exists": False},
{"$lt": now - 2}
{"$lt": now - 1}
]
}
},
sort=[{"last_used": "asc"}],
limit=1).all()
if len(result) == 0:
print("No valid token, waiting...")
sleep(1)
continue
else:
token = result[0]

doc = couchdb["tokens"][token["_id"]]
doc.fetch()

# Update last_used
needs_new_window = endpoint not in doc or doc[endpoint]["since"] < min_window
if needs_new_window:
doc.update({
"last_used": now,
endpoint: {
"since": now,
"total": 1
}
})
else:
doc.update({
"last_used": now,
endpoint: {
**doc[endpoint],
"total": doc[endpoint]["total"] + 1
}
})

try:
result = couchdb["tokens"].get_query_result(
selector=selector,
sort=[{"last_used": "asc"}],
limit=1).all()
if len(result) == 0:
print("No valid token, waiting...")
couchdb.disconnect()
sleep(random() * 0.5 + 0.5)
couchdb.connect()
else:
token = result[0]

doc = couchdb["tokens"][token["_id"]]

# Update last_used
needs_new_window = endpoint not in doc or doc[endpoint]["since"] < min_window
if needs_new_window:
doc.update({
"last_used": now,
endpoint: {
"since": now,
"total": 1
}
})
else:
doc.update({
"last_used": now,
endpoint: {
**doc[endpoint],
"total": doc[endpoint]["total"] + 1
}
})

doc.save()
print("Using token: %s" % token["_id"])
complete = True
except Exception as e:
print("CouchDB Token Error", e)
print("Auth Error:", e)
sleep(random() * 0.3 + 0.1)
continue
finally:
couchdb.disconnect()

return token["token"]

Expand Down
102 changes: 53 additions & 49 deletions twitter-harvester/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,61 +5,65 @@


def _bounding_box(rings):
    """Return ``[min_x, min_y, max_x, max_y]`` over every point in *rings*.

    *rings* is walked exactly two levels deep (an iterable of iterables of
    points, each point indexable at 0 and 1), which matches the coordinate
    layout of a GeoJSON Polygon.
    NOTE(review): deeper nestings such as MultiPolygon are not unpacked here;
    confirm the input file only contains Polygon geometries.

    When there are no points at all the box is ``[None, None, None, None]``.
    """
    points = [point for ring in rings for point in ring]
    if not points:
        return [None, None, None, None]
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    return [min(xs), min(ys), max(xs), max(ys)]


def load_features(filepath, couchdb: CouchDB):
    """Load polygon features from a JSON file and register new ones in CouchDB.

    The file at *filepath* must contain a JSON object with a ``"features"``
    list. Each entry with a non-null ``"geometry"`` is reduced to its bounding
    box, and the feature id is the SHA-1 hex digest of that box's JSON
    encoding. Features whose id is not yet present in the ``features``
    database are inserted with ``newest`` and ``oldest`` set to ``None``.

    Args:
        filepath: Path of the JSON file to read.
        couchdb: Client used for the ``features`` database. It is connected
            at the start and always disconnected before returning or raising.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``loc_pid`` and
        ``box``, one per feature that has a geometry (whether or not it was
        already stored).

    Raises:
        OSError: If *filepath* cannot be opened.
        KeyError: If the file or a feature lacks an expected key.
    """
    try:
        couchdb.connect()
        if "features" not in couchdb.all_dbs():
            couchdb.create_database("features", partitioned=False)
            couchdb["features"].create_query_index(fields=["newest"])
            couchdb["features"].create_query_index(fields=[{"oldest": "desc"}])

        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(filepath, encoding="utf-8") as file:
            poly_features = json.load(file)["features"]

        features = []
        for feature in poly_features:
            if feature["geometry"] is None:
                continue
            box = _bounding_box(feature["geometry"]["coordinates"])
            feature_id = sha1(json.dumps(box).encode("utf8")).hexdigest()
            properties = feature["properties"]
            features.append({
                "id": feature_id,
                "name": properties["name"],
                "loc_pid": properties["loc_pid"],
                "box": box,
            })

        docs = couchdb["features"].get_query_result(
            selector={
                "_id": {
                    "$gt": None
                }
            }).all()

        # A set gives O(1) membership tests. It is also extended while
        # building the batch so that two features sharing a bounding box
        # (and therefore an id) are only submitted once.
        seen_ids = {doc["_id"] for doc in docs}
        new_docs = []
        for feature in features:
            if feature["id"] in seen_ids:
                continue
            seen_ids.add(feature["id"])
            new_docs.append({
                "_id": feature["id"],
                "box": feature["box"],
                "name": feature["name"],
                "loc_pid": feature["loc_pid"],
                "newest": None,
                "oldest": None,
            })

        # Skip the round trip when there is nothing to insert.
        if new_docs:
            couchdb["features"].bulk_docs(new_docs)
    finally:
        couchdb.disconnect()

    return features
Loading

0 comments on commit 1686ffc

Please sign in to comment.