Skip to content

Commit

Permalink
Adding fully implemented, passing tests locally
Browse files Browse the repository at this point in the history
  • Loading branch information
kcelebi committed Jun 11, 2023
1 parent 67c75e9 commit 70a799e
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 32 deletions.
176 changes: 144 additions & 32 deletions src/google_flight_analysis/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

__all__ = ['Scrape', '_Scrape', 'ScrapeObjects']

date_format = "%Y-%m-%d"
'''
Iterative scraping
If the value is already in the DB, don't run; just return the query
Expand Down Expand Up @@ -64,16 +65,131 @@ def __call__(self, *args):
# Ability to combine a going and return trip
# Can use this to chain multiple trips
def __add__(self, other):
    """Combine two trips of the same type into one larger trip.

    The arguments handed to ``Scrape`` depend on the operands' type:

    * ``one-way``: when ``other`` is the exact reverse of ``self`` the
      result is built as a round trip (origin, dest, both dates);
      otherwise the legs of both operands are chained.
    * ``round-trip`` / ``perfect-chain``: when both operands start at the
      same first origin the result is built as a perfect chain
      (origin, date, origin, date, ..., final destination); otherwise
      the legs of both operands are chained.
    * ``chain-trip``: the legs of both operands are chained.

    When both operands have already been queried, the new object's
    ``data`` is the concatenation of both operands' data.

    Raises:
        AssertionError: if the operand types differ, if exactly one of
            the operands has been queried, or (for every type except
            ``one-way``) if ``other`` does not start strictly after the
            last date of ``self``.
        NotImplementedError: if the operand type is not recognised.
    """
    assert self.type == other.type, "Can't add {a} with {b}. See docs".format(a = self.type, b = other.type)

    # Either both operands carry data or neither does.
    queried = not self.data.empty
    assert queried == (not other.data.empty), "Error with addition. Both queries must either be unused or queried."

    trip_type = self.type
    if trip_type not in ('one-way', 'round-trip', 'chain-trip', 'perfect-chain'):
        raise NotImplementedError()

    def legs(trip):
        # Flatten a trip into origin, dest, date for every leg.
        flat = []
        for i, date in enumerate(trip.date):
            flat += [trip.origin[i], trip.dest[i], date]
        return flat

    def stops(trip):
        # Flatten a trip into origin, date for every leg (perfect-chain form).
        flat = []
        for i, date in enumerate(trip.date):
            flat += [trip.origin[i], date]
        return flat

    if trip_type == 'one-way' and self.origin == other.dest and self.dest == other.origin:
        # Two mirrored one-ways form a round trip. Both dates are passed in
        # the queried and the unqueried case alike, so the resulting object
        # always describes the same trip as the data it carries.
        args = [self.origin[0], self.dest[0], *self.date, *other.date]
    else:
        if trip_type != 'one-way':
            # For a round trip date[-1] is the return date (date[1]).
            assert datetime.strptime(self.date[-1], date_format) < datetime.strptime(other.date[0], date_format), "Dates are not in order. Make sure to provide them in increasing order in YYYY-MM-DD format."

        if trip_type in ('round-trip', 'perfect-chain') and self.origin[0] == other.origin[0]:
            # Same starting airport: stitch into one perfect chain that
            # ends at the final destination of `other`.
            args = stops(self) + stops(other) + [other.dest[-1]]
        else:
            args = legs(self) + legs(other)

    combined = Scrape(*args)
    if queried:
        combined.data = pd.concat([self.data, other.data])
    return combined


def __str__(self):
return self.__repr__()

Expand Down Expand Up @@ -101,6 +217,20 @@ def clone(self, *args):
obj._set_properties(*args)
return obj

def unpack(self, args):
    """Flatten one level of nesting: return a single list holding the
    items of every iterable in ``args``, in order."""
    return [item for group in args for item in group]

def combine(self, other, *args):
    """Build a new ``Scrape`` from ``args``; when this object has data,
    attach the concatenation of this object's and ``other``'s data."""
    has_data = self.data is not None
    merged = Scrape(*args)
    if has_data:
        merged.data = pd.concat([self.data, other.data])
    return merged

'''
Set properties upon scraper called.
'''
Expand Down Expand Up @@ -141,6 +271,8 @@ def _set_properties(self, *args):
assert len(args[2]) == 10 and type(args[2]) == str, "Issue with arg 2, see docs"
assert len(args[3]) == 10 and type(args[3]) == str, "Issue with arg 3, see docs"

assert datetime.strptime(args[2], date_format) < datetime.strptime(args[3], date_format), "Dates are not in order. Make sure to provide them in increasing order in YYYY-MM-DD format."

self._origin, self._dest, self._date = [args[0], args[1]], [args[1], args[0]], args[2:]

assert len(self._origin) == len(self._dest) == len(self._date), "Issue with array lengths, talk to dev"
Expand All @@ -155,6 +287,9 @@ def _set_properties(self, *args):
assert len(args[i]) == 3 and type(args[i]) == str, "Issue with arg {}, see docs".format(i)
assert len(args[i + 1]) == 3 and type(args[i+1]) == str, "Issue with arg {}, see docs".format(i+1)
assert len(args[i + 2]) == 10 and type(args[i + 2]) == str, "Issue with arg {}, see docs".format(i+2)

if i > 0:
assert datetime.strptime(self._date[-1], date_format) < datetime.strptime(args[i + 2], date_format), "Dates are not in order ({d1} > {d2}). Make sure to provide them in increasing order in YYYY-MM-DD format.".format(d1 = self._date[-1], d2 = args[i+2])

self._origin += [args[i]]
self._dest += [args[i + 1]]
Expand All @@ -166,7 +301,7 @@ def _set_properties(self, *args):


# perfect-chain
elif len(args) >= 4 and len(args) % 2 == 1:
elif len(args) >= 4 and len(args) % 2 == 1 and len(args[-1]) == 3 and type(args[-1]) == str:
assert len(args[0]) == 3 and type(args[0]) == str, "Issue with arg 0, see docs"
assert len(args[1]) == 10 and type(args[1]) == str, "Issue with arg 1, see docs"

Expand All @@ -175,6 +310,7 @@ def _set_properties(self, *args):
for i in range(2, len(args)-1, 2):
assert len(args[i]) == 3 and type(args[i]) == str, "Issue with arg {}, see docs".format(i)
assert len(args[i + 1]) == 10 and type(args[i + 1]) == str, "Issue with arg {}, see docs".format(i+1)
assert datetime.strptime(self._date[-1], date_format) < datetime.strptime(args[i + 1], date_format), "Dates are not in order ({d1} > {d2}). Make sure to provide them in increasing order in YYYY-MM-DD format.".format(d1 = self._date[-1], d2 = args[i+1])

self._origin += [args[i]]
self._dest += [args[i]]
Expand All @@ -187,20 +323,9 @@ def _set_properties(self, *args):
self._url = self._make_url()
self._type = 'perfect-chain'



else:
raise NotImplementedError()

'''(
self._origin, self._dest, self._date_leave, self._date_return
) = args if len(args) >= 4 else args + (None,)
if len(args) >= 4:
self._url = [self._make_url(leave = True), self._make_url(leave = False)]
else:
self._url = self._make_url()'''

@property
def origin(self):
return self._origin
Expand Down Expand Up @@ -249,19 +374,8 @@ def type(self):
Scrape the object. Add support for multiple queries, iterative.
'''
def _scrape_data(self, driver):

results = [self._get_results(url, self._date[i], driver) for i, url in enumerate(self._url)]

self._data = pd.concat(results, ignore_index = True)

'''if self._date_return is not None:
leave_result = self._get_results(self._url[0], driver)
return_result = self._get_results(self._url[1], driver)
self._data = pd.concat([leave_result, return_result], ignore_index = True)
return
leave_result = self._get_results(self._url, driver)
self._data = leave_result'''


def _make_url(self):
Expand Down Expand Up @@ -313,8 +427,6 @@ def _clean_results(result, date):

@staticmethod
def _make_url_request(url, driver):
#driver = webdriver.Chrome()#'/Users/kayacelebi/Downloads/chromedriver')
#driver.maximize_window()
driver.get(url)

# Waiting and initial XPATH cleaning
Expand Down
34 changes: 34 additions & 0 deletions tests/test_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,24 @@ def func_0():
# perfect chain
res6 = Scrape("JFK", "2023-11-10", "AMS", "2023-11-17", "CDG", "2023-11-20", "IST", "2023-11-25", "JFK")

# addition tests
res7 = Scrape("JFK", "IST", "2023-12-05")
res8 = Scrape("IST", "JFK", "2023-12-30")

res9 = Scrape("JFK", "IST", "2023-12-05", "2023-12-30")
res10 = Scrape("JFK", "CDG", "2024-01-10", "2024-02-10")


print(res9.origin[0] == res10.origin[0], res9.type, res10.type)

print('12')
out12 = res1 + res2
print('78')
out78 = res7 + res8
print('910')
out910 = res9 + res10
print('done?')

'''os.system('rm tests/test_data/LGA-RDU.csv')
os.system('rm tests/test_data/CDG-IST.csv')
os.system('rm -rf tests/test_data/.access')
Expand Down Expand Up @@ -121,6 +139,22 @@ def test_22():
assert res6.date == ["2023-11-10", "2023-11-17", "2023-11-20", "2023-11-25"], "Test 22 Failed."


#-------ADDITION

def test_23():
    """The sum of res1 and res2 (out12) must be a chain trip."""
    result_type = out12.type
    assert result_type == 'chain-trip', "Test 23 Failed."

def test_24():
    """The sum of res1 and res2 (out12) must carry at least one row of data."""
    row_count = out12.data.shape[0]
    assert row_count > 0, "Test 24 Failed."

def test_25():
    """Two mirrored one-way trips (res7 + res8) must add up to a round trip."""
    result_type = out78.type
    assert result_type == 'round-trip', "Test 25 Failed."

def test_26():
    """Two round trips from the same origin (res9 + res10) must add up to a perfect chain."""
    result_type = out910.type
    assert result_type == 'perfect-chain', "Test 26 Failed."



'''#-------CACHE 1
def test_11():
Expand Down

0 comments on commit 70a799e

Please sign in to comment.