"""Flask service for exploring the PLOD dataset and serving a fine-tuned NER model.

Endpoints:
    GET  /use-pretrained  load the tokenizer and model into module globals
    POST /predict         tag a single input string
    GET  /test-model      run the loaded model over a random dataset sample

The model must be loaded through ``/use-pretrained`` before ``/predict`` or
``/test-model`` can do any work.
"""

import json
import logging
import os
import random
import re
import string
import time
from datetime import datetime
from functools import partial

import datasets
import evaluate
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from datasets import load_dataset
from flask import Flask, jsonify, request, render_template
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
    pipeline,
)

logger = logging.getLogger(__name__)

# Name/path handed to ``from_pretrained`` for both the tokenizer and the model.
MODEL_NAME = "SciBERT-finetuned-NER"

# Token-count limits used when filtering the two dataset variants.
MAX_FILTERED_TOKENS = 500
MAX_TEST_TOKENS = 400

# Default number of random examples drawn by the sampling helpers.
PREVIEW_SAMPLE_SIZE = 1000
TEST_SAMPLE_SIZE = 20000

# Populated by the /use-pretrained endpoint. Initialised to None so the other
# endpoints can report "not loaded" instead of failing with a NameError.
loaded_tokenizer = None
loaded_model = None


def filter_long(data, max_tokens=MAX_FILTERED_TOKENS):
    """Return True when the example has at most *max_tokens* tokens.

    Args:
        data: A dataset example exposing a ``'tokens'`` sequence.
        max_tokens: Inclusive upper bound on ``len(data['tokens'])``.
    """
    return len(data['tokens']) <= max_tokens


dataset = load_dataset("surrey-nlp/PLOD-filtered")
dataset = dataset['train']

# Length (in tokens) of the longest training example before filtering.
print(max(len(example['tokens']) for example in dataset))

dataset = dataset.filter(filter_long)


def print_random_tokens(dataset, sample_size=PREVIEW_SAMPLE_SIZE):
    """Print the ``'tokens'`` list of randomly chosen rows of *dataset*.

    Args:
        dataset: A dataset supporting ``len()`` and ``select(indices)``.
        sample_size: Number of distinct rows to print. Capped at the dataset
            size so that small datasets do not make ``random.sample`` raise
            ``ValueError``.

    Example usage:
        dataset = load_dataset("surrey-nlp/PLOD-unfiltered")
        train_dataset = dataset["train"]
        print_random_tokens(train_dataset)
    """
    num_rows = len(dataset)
    sample_size = max(0, min(sample_size, num_rows))

    # Unique random row indices.
    random_indices = random.sample(range(num_rows), sample_size)

    for tokens in dataset.select(random_indices)['tokens']:
        print(tokens)


app = Flask(__name__)


def _model_missing_response():
    """Return a 503 response tuple when the model is not loaded, else None."""
    if loaded_tokenizer is None or loaded_model is None:
        return jsonify(error="Model not loaded; call /use-pretrained first"), 503
    return None


@app.route('/use-pretrained', methods=['GET'])
def use_pretrained():
    """Endpoint to load and use a pre-trained model.

    Returns:
        A JSON success message, or a JSON error with status 500 when loading
        the tokenizer or the model fails.
    """
    global loaded_tokenizer, loaded_model
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    except Exception as e:
        # Endpoint boundary: report the failure to the client and keep a trace.
        logger.exception("Failed to load %s", MODEL_NAME)
        return jsonify(error=str(e)), 500

    # Publish both only after both loaded, so the globals are never half-set.
    loaded_tokenizer, loaded_model = tokenizer, model
    return jsonify(success="Pre-trained model loaded successfully")


# The model must be loaded with /use-pretrained before this endpoint is called.
# Run from the command line with:
#   curl -s -H "Content-Type: application/json" -X POST -d '{"input": "<text>"}' localhost:8080/predict
# Examples:
#   curl -s -H "Content-Type: application/json" -X POST -d '{"input": "For this purpose the Gothenburg Young Persons Empowerment Scale (GYPES) was developed."}' localhost:8080/predict
#   curl -s -H "Content-Type: application/json" -X POST -d '{"input": "Recent work by us and others suggest that the host’s heat shock protein 90 (Hsp90) chaperone can modulate the evolutionary paths traversed by viruses [18, 19]."}' localhost:8080/predict
@app.route('/predict', methods=['POST'])
def predict():
    """Tag the text supplied in the JSON field ``input``.

    Returns:
        JSON ``{"predictions": "<stringified predictions>"}`` on success,
        status 400 when the body has no usable ``input`` field, or status 503
        when the model has not been loaded yet.
    """
    not_ready = _model_missing_response()
    if not_ready is not None:
        return not_ready

    # silent=True yields None for a missing or malformed JSON body.
    payload = request.get_json(silent=True)
    inputs = payload.get('input') if isinstance(payload, dict) else None
    if not inputs:
        return jsonify(error="Request body must be JSON with a non-empty 'input' field"), 400

    # NOTE(review): split_string, predict_tags, label_encoding and save_results
    # are not defined anywhere in this file; they must be provided before this
    # endpoint can succeed. Their contracts are assumed from the call sites.
    converted_inputs = split_string(inputs)
    predictions = predict_tags(converted_inputs, loaded_tokenizer, loaded_model, label_encoding)

    # Each prediction is indexed as (token, tag, ...); keep only the tag.
    ner_tags = [prediction[1] for prediction in predictions]
    save_results(converted_inputs, ner_tags)
    return jsonify(predictions=str(predictions))


@app.route('/test-model', methods=['GET'])
def test_model():
    """Endpoint to test the pre-trained model on random dataset samples.

    Draws ``TEST_SAMPLE_SIZE`` random examples (override with the ``n`` query
    parameter) from the unfiltered PLOD training split, restricted to examples
    of at most ``MAX_TEST_TOKENS`` tokens, and tags each with the loaded model.

    Returns:
        A JSON list of ``{"text": ..., "predictions": [[token, tag], ...]}``,
        or status 503 when the model has not been loaded yet.
    """
    not_ready = _model_missing_response()
    if not_ready is not None:
        return not_ready

    start_time = time.time()
    test_dataset = load_dataset("surrey-nlp/PLOD-unfiltered", split='train')
    test_dataset = test_dataset.filter(filter_long, fn_kwargs={'max_tokens': MAX_TEST_TOKENS})

    # Cap the sample at the dataset size: random.sample raises ValueError when
    # asked for more items than the population holds.
    requested = request.args.get('n', default=TEST_SAMPLE_SIZE, type=int)
    sample_size = max(0, min(requested, len(test_dataset)))
    sample_indices = random.sample(range(len(test_dataset)), sample_size)
    sampled_data = test_dataset.select(sample_indices)

    # Loop-invariant label lookup (class id -> tag name).
    int2str = test_dataset.features['ner_tags'].feature.int2str

    results = []
    print("in test_model")
    for item in sampled_data:
        # Join tokens to form a single string as the model expects a sequence.
        input_text = " ".join(item['tokens'])
        # truncation=True keeps the encoded input within the model's maximum
        # length; word-level filtering alone does not bound the subword count.
        inputs = loaded_tokenizer(input_text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = loaded_model(**inputs)

        # Index the single batch element explicitly; squeeze() would also
        # collapse a length-1 sequence into a scalar.
        predicted_ids = torch.argmax(outputs.logits, dim=-1)[0].tolist()
        tokens = loaded_tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
        predicted_tags = [int2str(tag_id) for tag_id in predicted_ids]

        # Combine tokens and their predicted tags.
        token_predictions = list(zip(tokens, predicted_tags))
        results.append({'text': input_text, 'predictions': token_predictions})

    total_time = time.time() - start_time
    print(f"Total time taken: {total_time}")

    return jsonify(results)


if __name__ == '__main__':
    # Entry point for running on the local machine.
    # Host is localhost; port is 8080.
    app.run(host='127.0.0.1', port=8080, debug=False)