"""Flask service for exercising a fine-tuned SciBERT token-classification model.

Reconstructed from the code cells of the ``testing.ipynb`` notebook.

Endpoints:
    GET  /use-pretrained  load the "SciBERT-finetuned-NER" tokenizer and model
    POST /predict         tag a single input string
    GET  /test-model      run the loaded model over random PLOD-unfiltered rows

NOTE(review): ``split_string``, ``predict_tags``, ``label_encoding`` and
``save_results`` are used by ``/predict`` but are not defined in the visible
notebook cells -- presumably they live elsewhere; confirm before deploying.
"""

from datetime import datetime
from functools import partial
import json
import logging
import os
import random
import re
import string
import time

import datasets
from datasets import load_dataset
import evaluate
from flask import Flask, jsonify, request, render_template
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import pipeline
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# Name/path passed to ``from_pretrained`` for both tokenizer and model.
MODEL_NAME = "SciBERT-finetuned-NER"
# Number of rows /test-model tries to evaluate (clamped to the dataset size).
TEST_SAMPLE_SIZE = 20000
# Used only if the model config does not expose ``max_position_embeddings``;
# 512 is the limit reported by the RuntimeError in the recorded notebook run.
FALLBACK_MAX_POSITIONS = 512

# Populated by /use-pretrained. Defined up front so the other endpoints can
# detect "not loaded yet" instead of failing with a NameError.
loaded_tokenizer = None
loaded_model = None


def filter_long(data, max_tokens=500):
    """Return True when the example has at most ``max_tokens`` tokens.

    Args:
        data: A dataset row with a ``'tokens'`` sequence.
        max_tokens: Inclusive upper bound on ``len(data['tokens'])``.
    """
    return len(data['tokens']) <= max_tokens


def print_random_tokens(dataset, num_samples=1000):
    """Print the ``'tokens'`` list of randomly chosen rows of ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and ``.select(indices)`` whose
            selection exposes a ``'tokens'`` column.
        num_samples: How many distinct rows to print. Clamped to the number of
            rows so a small dataset no longer makes ``random.sample`` raise
            ``ValueError``.

    Example:
        print_random_tokens(load_dataset("surrey-nlp/PLOD-unfiltered")["train"])
    """
    num_rows = len(dataset)
    random_indices = random.sample(range(num_rows), min(num_samples, num_rows))
    for tokens in dataset.select(random_indices)['tokens']:
        print(tokens)


# Module-level data preparation, kept from the notebook: load the training
# split, record the longest example, then drop examples over 500 tokens.
dataset = load_dataset("surrey-nlp/PLOD-filtered")
dataset = dataset['train']
# The recorded notebook run displayed 1247 for this value.
longest_example_tokens = max(len(example['tokens']) for example in dataset)
dataset = dataset.filter(filter_long)

app = Flask(__name__)


def _model_not_loaded_response():
    """Build the JSON error returned when /use-pretrained has not been called."""
    return jsonify(error="Model not loaded; call /use-pretrained first"), 503


@app.route('/use-pretrained', methods=['GET'])
def use_pretrained():
    """Endpoint to load the pre-trained tokenizer and model into module globals.

    Returns:
        JSON ``{"success": ...}`` on success, or ``{"error": ...}`` with HTTP
        500 if loading raises.
    """
    global loaded_tokenizer, loaded_model
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    except Exception as e:
        # Endpoint boundary: report the failure to the client, but keep the
        # traceback in the server log as well.
        app.logger.exception("Failed to load %s", MODEL_NAME)
        return jsonify(error=str(e)), 500
    return jsonify(success="Pre-trained model loaded successfully")


@app.route('/predict', methods=['POST'])
def predict():
    """Endpoint to tag one input string with the loaded model.

    /use-pretrained must be called first so the tokenizer and model exist.

    Run from the command line with:
        curl -s -H "Content-Type: application/json" -X POST -d '{"input": }' localhost:8080/predict
    Examples:
        curl -s -H "Content-Type: application/json" -X POST -d '{"input": "For this purpose the Gothenburg Young Persons Empowerment Scale (GYPES) was developed."}' localhost:8080/predict
        curl -s -H "Content-Type: application/json" -X POST -d '{"input": "Recent work by us and others suggest that the host’s heat shock protein 90 (Hsp90) chaperone can modulate the evolutionary paths traversed by viruses [18, 19]."}' localhost:8080/predict

    Returns:
        JSON ``{"predictions": "<str(predictions)>"}``; HTTP 400 when the body
        is not a JSON object with a string ``"input"``; HTTP 503 when no model
        has been loaded.
    """
    if loaded_tokenizer is None or loaded_model is None:
        return _model_not_loaded_response()

    payload = request.get_json(silent=True)
    inputs = payload.get('input') if isinstance(payload, dict) else None
    if not isinstance(inputs, str):
        return jsonify(error='Request body must be JSON with a string "input" field'), 400

    # NOTE(review): the four helpers below are not defined in the visible
    # notebook cells; presumably each prediction is a (token, tag) pair,
    # since element [1] is saved as the NER tag -- confirm.
    converted_inputs = split_string(inputs)
    predictions = predict_tags(converted_inputs, loaded_tokenizer, loaded_model, label_encoding)

    ner_tags = [prediction[1] for prediction in predictions]
    save_results(converted_inputs, ner_tags)
    return jsonify(predictions=str(predictions))


@app.route('/test-model', methods=['GET'])
def test_model():
    """Endpoint to run the loaded model on random PLOD-unfiltered samples.

    Loads the ``train`` split of ``surrey-nlp/PLOD-unfiltered``, keeps examples
    with at most 400 tokens, samples up to ``TEST_SAMPLE_SIZE`` of them, and
    returns each one's text with per-word-piece predicted tags.

    Returns:
        JSON list of ``{"text": ..., "predictions": [[token, tag], ...]}``;
        HTTP 503 when no model has been loaded.
    """
    if loaded_tokenizer is None or loaded_model is None:
        return _model_not_loaded_response()

    start_time = time.perf_counter()
    test_dataset = load_dataset("surrey-nlp/PLOD-unfiltered", split='train')
    test_dataset = test_dataset.filter(partial(filter_long, max_tokens=400))

    # Clamp so a dataset smaller than the requested sample cannot raise.
    sample_size = min(TEST_SAMPLE_SIZE, len(test_dataset))
    sample_indices = random.sample(range(len(test_dataset)), sample_size)
    sampled_data = test_dataset.select(sample_indices)

    # 400 whitespace tokens can expand to more word pieces than the model has
    # position embeddings for (the recorded run failed with 534 vs 512), so
    # every input is truncated to the model's limit.
    max_length = getattr(
        loaded_model.config, "max_position_embeddings", FALLBACK_MAX_POSITIONS
    )
    # Loop-invariant label lookup (tags are decoded with the dataset's labels).
    label_feature = test_dataset.features['ner_tags'].feature

    results = []
    print("in test_model")
    for item in sampled_data:
        # Join tokens to form a single string as the model expects a sequence
        input_text = " ".join(item['tokens'])
        inputs = loaded_tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            outputs = loaded_model(**inputs)
        # One sequence per call, so index the single batch row explicitly.
        predicted_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()
        tokens = loaded_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
        predicted_tags = [label_feature.int2str(p) for p in predicted_ids]

        # 'text' is the full input; 'predictions' covers only the (possibly
        # truncated) word pieces the model actually saw.
        token_predictions = list(zip(tokens, predicted_tags))
        results.append({'text': input_text, 'predictions': token_predictions})

    total_time = time.perf_counter() - start_time
    print(f"Total time taken: {total_time}")

    return jsonify(results)


if __name__ == '__main__':
    # Entry point for running on the local machine
    # host is localhost; port is 8080
    app.run(host='127.0.0.1', port=8080, debug=False)