Merge pull request #1 from webaverse/add-fastapi-endpoint

Add fastapi endpoint
webaverse · Nov 2, 2022 · ebc4bae · ebc4bae
2 parents 3a29b74 + 13bb654
commit ebc4bae
Show file tree

Hide file tree

Showing 2 changed files with 135 additions and 0 deletions.
diff --git a/main.py b/main.py
@@ -0,0 +1,133 @@
+from torchvision.transforms.functional import InterpolationMode
+from torchvision import transforms
+from PIL import Image
+import requests
+import torch
+
+from fastapi import FastAPI, UploadFile, Form
+from fastapi.responses import HTMLResponse
+
+from models.blip import blip_decoder, blip_feature_extractor
+from models.blip_vqa import blip_vqa
+from models.blip_itm import blip_itm
+
+import uvicorn
+
+
+app = FastAPI()
+# Use the first GPU when CUDA is available; otherwise fall back to CPU so the service can still start.
+device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+image_size = 384  # side length images are resized to; also passed to every BLIP model constructor
+
+
+def load_image(img, image_size, device):
+	"""Read an image and convert it into a normalized single-image batch tensor on `device`.
+
+	`img` is whatever PIL's `Image.open` accepts. The image is forced to RGB, resized to
+	(image_size, image_size) with bicubic interpolation, normalized, and given a leading batch dim.
+	"""
+	raw_image = Image.open(img).convert('RGB')
+	transform = transforms.Compose([
+		transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
+		transforms.ToTensor(),
+		transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+		])
+	return transform(raw_image).unsqueeze(0).to(device)
+
+
+def load_image_from_url(img_url, image_size, device):
+	"""Download the image at `img_url` and preprocess it exactly like `load_image`.
+
+	Raises `requests.RequestException` on connection errors, timeouts and non-2xx responses.
+	NOTE(review): `img_url` is fetched as-is; if callers are untrusted, restrict hosts (SSRF).
+	"""
+	# Context manager releases the connection; the timeout stops a stalled server hanging the request.
+	with requests.get(img_url, stream=True, timeout=10) as response:
+		# Fail on HTTP errors here instead of handing an error page to PIL.
+		response.raise_for_status()
+		# Make the raw stream undo gzip/deflate so PIL receives the actual image bytes.
+		response.raw.decode_content = True
+		return load_image(response.raw, image_size, device)
+
+
+@app.get('/')
+def main():
+	"""Serve a bare HTML form that posts a task name and an image file to /upload."""
+	return HTMLResponse(content='''
+	<body>
+	<form action="/upload" enctype="multipart/form-data" method="post">
+	<span>Task: </span><input name="task" type="text">
+	<br />
+	<br />
+	<input name="file" type="file">
+	<br />
+	<br />
+	<input type="submit">
+	</form>
+	</body>
+	''')
+
+
+@app.post('/upload')
+async def upload_image(task: str = Form(), file: UploadFile = Form()):
+	"""Run the model selected by `task` on an uploaded image.
+
+	Returns {'Caption': <first item of generate()>} on success, or {'Error': <message>} for an
+	unrecognized task or any exception raised while loading the image or running the model.
+	"""
+	try:
+		if task == 'image_captioning':
+			model = image_captioning_model
+		elif task == 'vqa':
+			model = vqa_model
+		elif task == 'feature_extraction':
+			model = feature_extraction_model
+		elif task == 'text_matching':
+			model = image_text_matching_model
+		else:
+			# Without this branch an unrecognized task fell through and the route returned None.
+			return {'Error': 'Unknown task: ' + task}
+		image = load_image(file.file, image_size, device)
+		# NOTE(review): confirm the vqa / feature / matching models accept this generate() call.
+		with torch.no_grad():
+			caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)  # beam search
+		return {'Caption': caption[0]}
+	except Exception as e:
+		# An exception instance is not a JSON value; return its text so the client sees the message.
+		return {'Error': f'{type(e).__name__}: {e}'}
+
+
+@app.post('/upload/url')
+async def upload_image(task: str, img_url: str):
+	"""Caption the image at `img_url`; returns {'Caption': ...} or, on any failure, {'Error': message}."""
+	# NOTE(review): this def reuses the name of the /upload handler and rebinds it at module level.
+	try:
+		if task != 'image_captioning':  # the only task this endpoint implements
+			return {'Error': 'Unsupported task: ' + task}
+		image = load_image_from_url(img_url, image_size, device)
+		with torch.no_grad():
+			caption = image_captioning_model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)  # beam search
+		return {'Caption': caption[0]}
+	except Exception as e:
+		# An exception instance is not a JSON value; return its text so the client sees the message.
+		return {'Error': f'{type(e).__name__}: {e}'}
+
+
+if __name__ == '__main__':
+	def _ready(model):
+		# Switch to eval mode and move to `device`; the value returned by `.to()` is what gets stored.
+		model.eval()
+		return model.to(device)
+
+	# All four BLIP checkpoints are loaded once, before the server starts; the route
+	# handlers look these names up as module globals at request time.
+	image_captioning_model = _ready(blip_decoder(
+		pretrained='./models/model_base_capfilt_large.pth', image_size=image_size, vit='base'))
+	vqa_model = _ready(blip_vqa(
+		pretrained='./models/model_base_vqa_capfilt_large.pth', image_size=image_size, vit='base'))
+	feature_extraction_model = _ready(blip_feature_extractor(
+		pretrained='./models/model_base.pth', image_size=image_size, vit='base'))
+	image_text_matching_model = _ready(blip_itm(
+		pretrained='./models/model_base_retrieval_coco.pth', image_size=image_size, vit='base'))
+
+	uvicorn.run(app, host='0.0.0.0', port=80)
diff --git a/requirements.txt b/requirements.txt
@@ -2,3 +2,6 @@ timm==0.4.12
 transformers==4.15.0
 fairscale==0.4.4
 pycocoevalcap
+fastapi
+python-multipart  # required by FastAPI to parse Form()/UploadFile request bodies
+uvicorn