diff --git a/transfer_learning_attempt.ipynb b/transfer_learning_attempt.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..fc448b9bd33c23c7185902988dedded62f058c73
--- /dev/null
+++ b/transfer_learning_attempt.ipynb
@@ -0,0 +1,228 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb812f11-3c4b-4af2-91da-ce6062cbccfe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datasets, evaluate\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "from transformers import (\n",
+    "    AutoTokenizer,\n",
+    "    AutoModelForTokenClassification,\n",
+    "    DataCollatorForTokenClassification,\n",
+    "    TrainingArguments,\n",
+    "    Trainer,\n",
+    ")\n",
+    "\n",
+    "# SciBERT is pre-trained on scientific text, which suits the PLOD abbreviation data.\n",
+    "model_name = \"allenai/scibert_scivocab_uncased\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "# PLOD provides one tag per token, so this is token classification\n",
+    "# (not sequence classification) with four labels.\n",
+    "model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3edd79ea-0ca4-4543-8469-459f6d62603f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CW_datasets = datasets.load_dataset(\"surrey-nlp/PLOD-CW\")\n",
+    "train_dataset = CW_datasets[\"train\"]\n",
+    "test_dataset = CW_datasets[\"test\"]\n",
+    "\n",
+    "label_encoding = {\"B-O\": 0, \"B-AC\": 1, \"B-LF\": 2, \"I-LF\": 3}\n",
+    "label_list = list(label_encoding.keys())\n",
+    "\n",
+    "metric = evaluate.load(\"seqeval\")\n",
+    "\n",
+    "def tokenize_and_align_labels(examples):\n",
+    "    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)\n",
+    "\n",
+    "    labels = []\n",
+    "    for i, label in enumerate(examples['ner_tags']):\n",
+    "        word_ids = tokenized_inputs.word_ids(batch_index=i)\n",
+    "        previous_word_idx = None\n",
+    "        label_ids = []\n",
+    "        for word_idx in word_ids:\n",
+    "            if word_idx is None:\n",
+    "                # Special tokens ([CLS], [SEP]) are ignored by the loss.\n",
+    "                label_ids.append(-100)\n",
+    "            elif word_idx != previous_word_idx:\n",
+    "                # Label only the first sub-token of each word.\n",
+    "                label_ids.append(label_encoding[label[word_idx]])\n",
+    "            else:\n",
+    "                label_ids.append(-100)\n",
+    "            previous_word_idx = word_idx\n",
+    "        labels.append(label_ids)\n",
+    "\n",
+    "    tokenized_inputs[\"labels\"] = labels\n",
+    "    return tokenized_inputs\n",
+    "\n",
+    "def compute_metrics(eval_preds):\n",
+    "    # Standard seqeval pattern: drop -100 positions, map ids back to tag strings.\n",
+    "    logits, labels = eval_preds\n",
+    "    predictions = np.argmax(logits, axis=-1)\n",
+    "    true_predictions = [\n",
+    "        [label_list[p] for p, l in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    true_labels = [\n",
+    "        [label_list[l] for p, l in zip(prediction, label) if l != -100]\n",
+    "        for prediction, label in zip(predictions, labels)\n",
+    "    ]\n",
+    "    results = metric.compute(predictions=true_predictions, references=true_labels)\n",
+    "    return {\n",
+    "        \"precision\": results[\"overall_precision\"],\n",
+    "        \"recall\": results[\"overall_recall\"],\n",
+    "        \"f1\": results[\"overall_f1\"],\n",
+    "        \"accuracy\": results[\"overall_accuracy\"],\n",
+    "    }\n",
+    "\n",
+    "tokenized_datasets = CW_datasets.map(tokenize_and_align_labels, batched=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24833ca6-b7f7-457a-9964-fbcbd71e1cb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_args = TrainingArguments(\n",
+    "    output_dir='./results',\n",
+    "    num_train_epochs=3,\n",
+    "    per_device_train_batch_size=8,\n",
+    "    per_device_eval_batch_size=8,\n",
+    "    warmup_steps=500,\n",
+    "    weight_decay=0.01,\n",
+    "    logging_dir='./logs',\n",
+    "    logging_steps=10,\n",
+    "    evaluation_strategy='epoch',  # report the seqeval scores once per epoch\n",
+    ")\n",
+    "\n",
+    "# Pads inputs and labels together; the default collator cannot handle\n",
+    "# the variable-length label sequences produced above.\n",
+    "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_datasets['train'],\n",
+    "    eval_dataset=tokenized_datasets['test'],\n",
+    "    data_collator=data_collator,\n",
+    "    compute_metrics=compute_metrics,\n",
+    ")\n",
+    "\n",
+    "trainer.train()\n"
+   ]
+  },
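+  {
+   "cell_type": "markdown",
+   "id": "inference-demo-note",
+   "metadata": {},
+   "source": [
+    "Quick sanity check (an added sketch, not part of the original pipeline): tag one\n",
+    "test sentence with the fine-tuned model. The choice of `test_dataset[0]` and the\n",
+    "first-sub-token decoding are illustrative assumptions.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "inference-demo",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative only: any test sentence would do here.\n",
+    "sample = test_dataset[0]\n",
+    "encoded = tokenizer(sample[\"tokens\"], truncation=True, is_split_into_words=True, return_tensors=\"pt\")\n",
+    "with torch.no_grad():\n",
+    "    logits = model(**{k: v.to(model.device) for k, v in encoded.items()}).logits\n",
+    "pred_ids = logits.argmax(dim=-1)[0].tolist()\n",
+    "\n",
+    "# Print one tag per word (its first sub-token), mirroring the label alignment above.\n",
+    "previous_word_idx = None\n",
+    "for word_idx, pred_id in zip(encoded.word_ids(), pred_ids):\n",
+    "    if word_idx is not None and word_idx != previous_word_idx:\n",
+    "        print(sample[\"tokens\"][word_idx], \"->\", label_list[pred_id])\n",
+    "    previous_word_idx = word_idx\n"
+   ]
+  },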
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b773f9e-31ac-4e04-86d1-f0d558ee6b0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import Dataset\n",
+    "\n",
+    "def update_model_with_new_data(new_texts, new_labels):\n",
+    "    # Prepare the new data in the same format as PLOD-CW.\n",
+    "    new_data = {\"tokens\": new_texts, \"ner_tags\": new_labels}\n",
+    "    new_dataset = Dataset.from_dict(new_data)\n",
+    "    tokenized_new_dataset = new_dataset.map(tokenize_and_align_labels, batched=True)\n",
+    "\n",
+    "    # Fine-tune the current model on the new data.\n",
+    "    trainer = Trainer(\n",
+    "        model=model,\n",
+    "        args=training_args,\n",
+    "        train_dataset=tokenized_new_dataset,\n",
+    "        data_collator=data_collator,\n",
+    "    )\n",
+    "    trainer.train()\n",
+    "\n",
+    "    # Save the updated model and tokenizer.\n",
+    "    saved_model_path = \"./path_to_your_saved_model\"\n",
+    "    model.save_pretrained(saved_model_path)\n",
+    "    tokenizer.save_pretrained(saved_model_path)\n"
+   ]
+  },
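+  {
+   "cell_type": "markdown",
+   "id": "update-demo-note",
+   "metadata": {},
+   "source": [
+    "Minimal usage sketch for `update_model_with_new_data` (added for illustration):\n",
+    "the sentence and tags below are invented placeholders in the PLOD scheme, not\n",
+    "real annotations.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "update-demo",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical example: one made-up sentence with an abbreviation (B-AC)\n",
+    "# and its long form (B-LF/I-LF); everything else is B-O.\n",
+    "new_texts = [\n",
+    "    [\"EGFR\", \"stands\", \"for\", \"epidermal\", \"growth\", \"factor\", \"receptor\", \".\"],\n",
+    "]\n",
+    "new_labels = [\n",
+    "    [\"B-AC\", \"B-O\", \"B-O\", \"B-LF\", \"I-LF\", \"I-LF\", \"I-LF\", \"B-O\"],\n",
+    "]\n",
+    "update_model_with_new_data(new_texts, new_labels)\n"
+   ]
+  }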
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "",
+   "name": ""
+  },
+  "language_info": {
+   "name": ""
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}