app.py
import io
import pickle
import re
import traceback

import nltk
import pdfplumber  # For PDF text extraction
from fastapi import FastAPI, File, Form, HTTPException, Request, UploadFile
from fastapi.responses import JSONResponse
from fastapi.templating import Jinja2Templates
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Initialize FastAPI app
app = FastAPI()

# Set up Jinja2 templates for HTML rendering
templates = Jinja2Templates(directory="templates")

# Load the artifacts saved during training
model = pickle.load(open('resumeg.pkl', 'rb'))        # trained classifier
le = pickle.load(open('resumele.pkl', 'rb'))          # label encoder
vectorizer = pickle.load(open('resumecv.pkl', 'rb'))  # fitted vectorizer
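
# A minimal sketch of how these pickles are assumed to have been produced; the
# training script is not in this file, and `cleaned_resumes`/`categories` are
# hypothetical names:
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   from sklearn.preprocessing import LabelEncoder
#   vectorizer = CountVectorizer()
#   X = vectorizer.fit_transform(cleaned_resumes)
#   le = LabelEncoder()
#   y = le.fit_transform(categories)
#   model.fit(X.toarray(), y)   # any scikit-learn classifier
#   # ...then pickle.dump(...) each of model, le, and vectorizer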
# NLTK setup for text preprocessing (quiet=True suppresses repeated download logs)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
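# For reference: the English stop word list covers words like "the", "is", "and";
# the WordNet lemmatizer maps e.g. "models" -> "model" (default noun POS).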
def preprocess_text(text):
    """Lowercase, strip noise, remove stop words, and lemmatize."""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>', '', text)                  # HTML tags
    text = re.sub(r'\S+@\S+', '', text)                # email addresses
    text = re.sub(r'@\w+', '', text)                   # @mentions
    text = re.sub(r'#\w+', '', text)                   # hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)            # non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()           # collapse whitespace

    # Tokenization and stop word removal
    tokens = [word for word in text.split() if word not in stop_words]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
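
# Illustrative example (input string is made up):
#   preprocess_text("Check https://example.com for 5 Models I am running!")
#   -> "check model running"   # URL, digit, and stop words removed; "models" lemmatized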
def extract_text_from_pdf(file_bytes):
    """Extract text from raw PDF bytes."""
    try:
        # Wrap the byte content in a file-like object for pdfplumber
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # extract_text() returns None for pages with no extractable text
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error extracting text from PDF: {str(e)}")
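
# Illustrative usage (file name is hypothetical):
#   with open("resume.pdf", "rb") as f:
#       text = extract_text_from_pdf(f.read())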
# Route to serve the HTML file
@app.get("/")
async def get_html(request: Request):
    """Serve the HTML file."""
    return templates.TemplateResponse("index.html", {"request": request})
# Define prediction route; the file and the text field are both optional,
# so plain text can be submitted when no file is uploaded
@app.post("/predict")
async def predict(file: UploadFile = File(None), text_input: str = Form(None)):
    """Predict the job category from an uploaded resume or raw text."""
    try:
        # Prefer an uploaded file over the plain text field
        if file:
            content = await file.read()
            if file.filename.endswith(".txt"):
                text = content.decode('utf-8')         # decode text file content
            elif file.filename.endswith(".pdf"):
                text = extract_text_from_pdf(content)  # extract text from PDF file
            else:
                return JSONResponse(content={"error": "Unsupported file type"}, status_code=400)
        elif text_input:
            text = text_input
        else:
            return JSONResponse(content={"error": "No file or text input provided"}, status_code=400)

        # Preprocess and vectorize with the vectorizer fitted during training
        processed_text = preprocess_text(text)
        text_vectorized = vectorizer.transform([processed_text]).toarray()

        # Predict, then map the numeric class back to its label
        prediction = model.predict(text_vectorized)
        category = le.inverse_transform(prediction)[0]
        return JSONResponse(content={"category": category})
    except Exception as e:
        # Print the stack trace in the server logs and return the error message
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)
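
# Example requests (illustrative; file name and predicted label are hypothetical):
#   curl -X POST -F "file=@resume.pdf" http://127.0.0.1:5500/predict
#   curl -X POST -F "text_input=Experienced Python developer..." http://127.0.0.1:5500/predict
#   -> {"category": "<predicted label>"}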
# Run the app
if __name__ == '__main__':
    import uvicorn
    uvicorn.run(app, host="127.0.0.1", port=5500)