Skip to content

Commit

Permalink
improved stripper and filter
Browse files Browse the repository at this point in the history
  • Loading branch information
alandgh committed Jun 13, 2022
1 parent 88b53a4 commit b0f1165
Showing 1 changed file with 80 additions and 110 deletions.
190 changes: 80 additions & 110 deletions menu_me/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,25 @@
import os
from google.oauth2 import service_account
import json

######### LOCAL ENV ##############
# from dotenv import load_dotenv, find_dotenv

# # Connecting with GCP: load the API key, search-engine ID and credentials from a local .env file
# env_path = find_dotenv()
# load_dotenv(env_path)
# GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
# GOOGLE_CX = os.getenv('GOOGLE_CX')
# CREDENTIALS_JSON_GOOGLE_CLOUD = os.getenv('CREDENTIALS_JSON_GOOGLE_CLOUD')

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
GOOGLE_CX = os.environ.get('GOOGLE_CX')
CREDENTIALS_JSON_GOOGLE_CLOUD = os.environ.get('CREDENTIALS_JSON_GOOGLE_CLOUD')

###############################
###### Google Vision API ######
###############################
def detect_text():
def detect_text(path):
"""Detects text in the file."""
from google.cloud import vision

Expand All @@ -30,7 +36,7 @@ def detect_text():

client = vision.ImageAnnotatorClient(credentials=credentials)
image=vision.Image()
image.source.image_uri='https://storage.googleapis.com/menu_me_bucket/img.jpg'
image.source.image_uri=path

response = client.text_detection(image=image)

Expand All @@ -51,43 +57,44 @@ def strip(response):
chars_to_remove = '0123456789!"\'#$%&()*+,-./:;<=>?@[\]^_`{|}~♦●★‒…£¡™¢∞§¶•ªº–≠≠œ∑´®†¥¨≤≥÷ç√€'

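# remove entry if it exactly matches any of these
# NOTE(review): a comma is missing after 'lunch' below, so Python concatenates it with 'dinner' into 'lunchdinner' and neither word is matched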
drop_exact_words = ['sandwiches','restaurant','menu', 'restaurant menu','thank you','drinks',
drop_exact_words = ['sandwiches','restaurant','menu',
'restaurant menu','thank you','drinks',
'appetizer','appetizers','mains','dessert',
'side','sides','side order','breakfast','lunch'
'dinner','supper','starter','starters','local',
'fresh','food','main']
'fresh','food','main','your','logo','brand name']

# remove these words from entry
words_to_remove = ['menu','restaurant','price','appetizer',
'appetizers','course','price','extra','extras']

# remove entry if it contains any of these
drop_contain_words = ['tax','consumer','advisory','illness']
drop_contain_words = ['tax','consumer','advisory','illness','facebook','instagram']

# remove entry if it starts with any of these
drop_start_words = ['include','includes','including','lorem','with','and',
drop_start_words = ['add','include','includes','including','lorem','with','and',
'served','serve']

# drop entry if it contains fewer chars than minimum
min_length = 4


text = response.text_annotations[0].description
menu_original = text.split('\n')

menu_chars_removed = []
for item in menu_original:
for char in chars_to_remove:
item = item.replace(char,'')
item = item.replace(char,' ')
menu_chars_removed.append(item)

menu_exact_matches_dropped = []
for item in menu_chars_removed:
if item.lower() in drop_exact_words:
pass
else:
menu_exact_matches_dropped.append(item)

menu_words_removed = []
for item in menu_exact_matches_dropped:
temporary = []
Expand All @@ -96,7 +103,7 @@ def strip(response):
temporary.append(word)
remaining_words = ' '.join(temporary)
menu_words_removed.append(remaining_words)

menu_contains_dropped = []
for item in menu_words_removed:
temporary = []
Expand All @@ -108,129 +115,118 @@ def strip(response):
temporary.append(word)
remaining_words = ' '.join(temporary)
menu_contains_dropped.append(remaining_words)

menu_starts_dropped = []
for item in menu_contains_dropped:
temporary = item.split(' ')
if temporary[0].lower() in drop_start_words:
pass
else:
menu_starts_dropped.append(item)

menu_exact_matches_dropped = []
for item in menu_starts_dropped:
if item.lower() in drop_exact_words:
pass
else:
menu_exact_matches_dropped.append(item)

bounding_white_space_removed = [item.strip() for item in menu_exact_matches_dropped]
too_short_dropped = [item for item in bounding_white_space_removed if len(item) >= min_length]
stripped_menu = too_short_dropped


duplicates_dropped = []
for item in too_short_dropped:
if item not in duplicates_dropped:
duplicates_dropped.append(item)


stripped_menu = duplicates_dropped

print(menu_original)
return(stripped_menu)


##################################
###### Image Search API ######
##################################

def search_image(query):
from google_images_search import GoogleImagesSearch
from google.cloud import vision

print(f'searching for {query}...')
print()

gis = GoogleImagesSearch(GOOGLE_API_KEY,GOOGLE_CX)

_search_params = {
'q': f'{query} recipe',
'num': 1,
'imgSize': 'large',
#'imgSize': 'large',
'imgType': 'photo',
'imgColorType': 'color'}

gis.search(search_params=_search_params)
print('fetching image:')
if len(gis.results()) == 0:
print('no image found, not verified as food.')
print()
return None

url = gis.results()[0].url
print(url)
print()
verified_queries = ['cheeseburger','burger','pizza','fried chicken','ice cream sundae']

verified_queries = ['cheeseburger','burger','pizza','fried chicken','ice cream sundae','fuyung hai']

if query.lower() in verified_queries:
print(f'{query} already in known foods database, no need to verify!')
print()
return url

client = vision.ImageAnnotatorClient()
image = vision.Image()
image.source.image_uri = url

response = client.label_detection(image=image, max_results=1)
label = [lab.description for lab in response.label_annotations]
score = [lab.score for lab in response.label_annotations]


text_response = client.text_detection(image=image)
texts = text_response.text_annotations
n_chars = 0
if len(texts)>0:
n_chars = len(texts[0].description)

print('verification filter:')
print('label must be Food')
print('label must be Food or Tableware')
print('score must be above .96')
print('number of chars must be below 100')
print()
print(f'label: {label}')
print(f'label score: {score}')
print(f'chars detected: {n_chars}')
print()

try:
if label[0] == 'Food' and score[0] > .96:
if (label[0] == 'Food' or label[0] == 'Tableware') and score[0] > .96 and n_chars < 100:
print('verified as food!')
print()
print(url)
print()
return url
except IndexError:
print('label missing, not verified as food')
return None


if label[0] in ['Food', 'Tableware']:
text_response = client.text_detection(image=image)
texts = text_response.text_annotations
n_chars = 0
if len(texts)>0:
n_chars = len(texts[0].description)


print('verification filter:')
print('label must be Food or Tableware')
print('score must be above .96')
print('number of chars must be below 100')
print()

print(f'label: {label}')
print(f'label score: {score}')
print(f'chars detected: {n_chars}')
print()

if (label[0] == 'Food' or label[0] == 'Tableware') and score[0] > .96 and n_chars < 100:
print('verified as food!')
print()
print(url)
print()
return url

pass

_search_params = {
'q': f'{query} recipe',
'num': 3,
# 'imgSize': 'large',
#'imgSize': 'large',
'imgType': 'photo',
'imgColorType': 'color',
'safe': 'medium'}

gis = GoogleImagesSearch(GOOGLE_API_KEY,GOOGLE_CX)
gis.search(search_params=_search_params)
urls = [result.url for result in gis.results()]
Expand All @@ -242,68 +238,42 @@ def search_image(query):
for url in urls:
print(url)
print()

print('verification filter:')
print('label must be Food')
print('score must be above .96')
print()


labels = []
scores = []
char_counts = []
for url in urls:

image.source.image_uri = url
response = client.label_detection(image=image, max_results=1)
label = [lab.description for lab in response.label_annotations]
score = [lab.score for lab in response.label_annotations]
labels.append(label)
scores.append(score)


text_response = client.text_detection(image=image)
texts = text_response.text_annotations
n_chars = 0
if len(texts)>0:
n_chars = len(texts[0].description)
char_counts.append(n_chars)

print(f'labels: {labels}')
print(f'label scores: {scores}')
print(f'chars detected: {char_counts}')
print()

try:
for label,score in zip(labels,scores):
if label[0] == 'Food' and score[0] > .96:
for label,score,n_chars in zip(labels,scores, char_counts):
try:
if (label[0] == 'Food' or label[0] == 'Tableware') and score[0] > .96 and n_chars < 100:
print('verified as food!')
print()
print(urls[labels.index(label)])
print()
return urls[labels.index(label)]
except IndexError:
print('label missing, not verified as food')
return None


if label[0] in ['Food', 'Tableware']:
char_counts = []
for url in urls:
text_response = client.text_detection(image=image)
texts = text_response.text_annotations
n_chars = 0
if len(texts)>0:
n_chars = len(texts[0].description)
char_counts.append(n_chars)

print('verification filter:')
print('label must be Food or Tableware')
print('score must be above .96')
print('number of chars must be below 100')
print()

print(f'labels: {labels}')
print(f'label score: {scores}')
print(f'chars detected: {char_counts}')
print()

for label,score,chars in zip(labels,scores,char_counts):
if (label[0] == 'Food' or label[0] == 'Tableware') and score[0] > .96 and chars < 100:
print('verified as food!')
print()
print(urls[labels.index(label)])
print()
return urls[labels.index(label)]

except:
pass

print('not verified as food.')
print()
return None
Expand Down

0 comments on commit b0f1165

Please sign in to comment.