Skip to content
Snippets Groups Projects
Commit 324f83f4 authored by Abhijeet's avatar Abhijeet
Browse files

nlp model

parent d9578f56
No related branches found
No related tags found
No related merge requests found
app.py 0 → 100644
from flask import Flask, request, jsonify
import joblib
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.util import ngrams
import sklearn_crfsuite
import gensim.downloader as api
# Fetch the NLTK resources needed at request time: the sentence/word
# tokenizer models, the English stopword list, and the POS tagger used
# inside preprocess_input. No-ops after the first successful download.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
app = Flask(__name__)
# Load the trained model
# NOTE(review): the CRF was trained elsewhere; it is assumed to expect the
# exact feature dicts produced by extract_features_with_word2vec below —
# confirm against the training pipeline.
model = joblib.load('model/crf_model.joblib')
def preprocess_input(data):
    """Lowercase, stopword-filter, and stem one sentence's tokens.

    Args:
        data: Raw tokens of a single sentence (list of str).

    Returns:
        A ``(tokens, pos_tags)`` pair of equal-length lists: the stemmed,
        lowercased tokens that survived stopword removal, and a POS tag for
        each.  Because stopwords are dropped, both lists may be shorter
        than ``data``.
    """
    blocklist = set(stopwords.words('english'))
    porter = PorterStemmer()
    kept_stems = []
    kept_tags = []
    for raw_token in data:
        lowered = raw_token.lower()
        if lowered in blocklist:
            continue
        kept_stems.append(porter.stem(lowered))
        # NOTE(review): tagging each token in isolation loses sentence
        # context, so the tagger can emit different tags than whole-sentence
        # tagging would — confirm this matches how the CRF's training data
        # was tagged before changing it.
        kept_tags.append(nltk.pos_tag([raw_token])[0][1])
    # No n-grams in the predictions
    return kept_stems, kept_tags
@app.route('/predict', methods=['POST'])
def predict():
    """Predict CRF labels for a JSON payload of tokens.

    Expects a JSON body shaped like ``{"tokens": ["The", "cat", ...]}``.

    Returns:
        A JSON array of predicted label strings, or a 400 JSON error when
        the payload is missing the ``tokens`` field.
    """
    data = request.get_json(force=True)
    # Robustness fix: a payload without 'tokens' previously raised KeyError
    # and surfaced as an opaque HTTP 500; reject it explicitly instead.
    if not isinstance(data, dict) or 'tokens' not in data:
        return jsonify({'error': "JSON body must contain a 'tokens' list"}), 400
    tokens, pos_tags = preprocess_input(data['tokens'])
    features = extract_features_with_word2vec(tokens, pos_tags, word2vec)
    # model.predict takes a batch of sentences; we send one and unwrap it.
    predictions = model.predict([features])[0]
    # NOTE(review): preprocess_input drops stopwords, so len(predictions) is
    # already <= len(data['tokens']) and this slice is a no-op.  The labels
    # align with the *filtered* tokens, not the original ones — confirm
    # callers expect that before relying on positional alignment.
    original_token_predictions = predictions[:len(data['tokens'])]
    return jsonify(original_token_predictions)
def extract_features_with_word2vec(sent, pos_tags, model):
    """Build per-token CRF feature dicts augmented with word2vec components.

    Args:
        sent: Tokens of one sentence.
        pos_tags: POS tag for each token, aligned index-for-index with ``sent``.
        model: Embedding lookup exposing ``vector_size``, ``in``, and ``[]``
            (e.g. a gensim KeyedVectors).

    Returns:
        A list of feature dicts, one per token.  Out-of-vocabulary tokens
        contribute an all-zero embedding.
    """
    dims = model.vector_size
    oov_vector = np.zeros(dims)  # read-only fallback shared by all OOV tokens
    per_token_features = []
    for token, tag in zip(sent, pos_tags):
        embedding = model[token] if token in model else oov_vector
        # NOTE(review): the 'word.lower()' key stores the token as-is (it is
        # already lowercased upstream) — key name kept verbatim so the
        # features match what the trained CRF saw.
        features = {
            'bias': 1.0,
            'word.lower()': token,
            'word.isupper()': token.isupper(),
            'word.istitle()': token.istitle(),
            'word.isdigit()': token.isdigit(),
            'pos': tag,
        }
        features.update((f'w2v_{axis}', embedding[axis]) for axis in range(dims))
        per_token_features.append(features)
    return per_token_features
# Load Word2Vec model
# Loads the 300-dimensional Google News vectors via gensim's downloader
# (fetched over the network on first use, then cached locally).
word2vec = api.load('word2vec-google-news-300')
if __name__ == '__main__':
    # debug=True enables Flask's reloader/debugger — development only.
    app.run(debug=True)
\ No newline at end of file
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment