May-20-2024, 08:07 AM
I want to fine tune a QnA HF model with the JSON file containing questions and answers, but I get aforementioned error.
Full code:
Full code:
# https://www.mlexpert.io/blog/alpaca-fine-tuning # https://wellsr.com/python/fine-tuning-huggingface-models-in-tensorflow-keras/ # https://learnopencv.com/fine-tuning-bert/ # https://medium.com/@karary/nlp-fine-tune-question-answering-model-%E5%AF%A6%E4%BD%9C-3-model-training-%E5%84%B2%E5%AD%98%E8%88%87-inference-13d2a5bf5c32 # https://huggingface.co/datasets/Mangacoder007/QNA-chat_app # https://www.youtube.com/watch?v=0tT5suZSdkA # https://medium.com/@anyuanay/fine-tuning-the-pre-trained-bert-model-in-hugging-face-for-question-answering-8edc76890ce0 import transformers as tf import datasets as ds import pandas as pd import numpy as np import torch import json ############## Check if CUDA is enabled. ################ # hasCUDA=torch.cuda.is_available() # print(f"CUDA Enabled? {hasCUDA}") # device="cuda" if hasCUDA else "cpu" ############## Loading file and populating data ################ dataset = ds.load_dataset("json", data_files="qna.json", split="train") input_ids=[] for i in range (len(dataset)): input_ids.append(i) dataset=dataset.add_column('input_ids', input_ids) ############## Model ########################################## modelName="./distilbert-base-cased" #or replace the model name with whatever you feel like. config=tf.AutoConfig.from_pretrained(modelName+"/config.json") model=tf.AutoModelForQuestionAnswering.from_pretrained(modelName,config=config) tokenizer=tf.AutoTokenizer.from_pretrained(modelName) if tokenizer.pad_token is None: tokenizer.add_special_tokens({'pad_token': '[PAD]'}) ############## Tokenization ####################################### tokenizer.pad_token=tokenizer.eos_token def preprocessFunction(examples): tokens=tokenizer(examples['question'], truncation=True) return tokens tokenizedDS=dataset.map(preprocessFunction, batched=True) ############## Training ####################################### trnArgs=tf.TrainingArguments( output_dir="./", evaluation_strategy="epoch", save_strategy="epoch", learning_rate=2e-5, num_train_epochs=3, remove_unused_columns=False, fp16=True ) trainer=tf.Trainer( model=model, args=trnArgs, train_dataset=dataset, eval_dataset=None, tokenizer=tokenizer ) trainer.train()Traceback:
Traceback (most recent call last): File "C:\Users\chenp\Documents\ML\ml2.py", line 22, in <module> dataset = ds.load_dataset("squad", split="train") File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 2587, in load_dataset builder_instance = load_dataset_builder( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 2259, in load_dataset_builder dataset_module = dataset_module_factory( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 1885, in dataset_module_factory return HubDatasetModuleFactoryWithoutScript( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 1195, in __init__ increase_load_count(name, resource_type="dataset") File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 289, in increase_load_count head_hf_s3(name, filename=name + ".py", dataset=(resource_type == "dataset")) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 111, in head_hf_s3 return http_head( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 462, in http_head response = _request_with_retry( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 336, in _request_with_retry response = requests.request(method=method.upper(), url=url, timeout=timeout, **params) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py", line 59, in request return session.request(method=method, url=url, **kwargs) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 587, in request resp = self.send(prep, **send_kwargs) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 701, in send r = adapter.send(request, **kwargs) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py", line 489, in send resp = conn.urlopen( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen httplib_response = self._make_request( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request self._validate_conn(conn) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 1042, in _validate_conn conn.connect() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py", line 419, in connect self.sock = ssl_wrap_socket( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket ssl_sock = _ssl_wrap_socket_impl( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl return ssl_context.wrap_socket(sock, server_hostname=server_hostname) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 512, in wrap_socket return self.sslsocket_class._create( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1070, in _create self.do_handshake() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1341, in do_handshake self._sslobj.do_handshake() KeyboardInterrupt ^C C:\Users\chenp\Documents\ML>python ml.py Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at ./distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 2024-05-20 16:03:42.130339: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-05-20 16:03:42.738309: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 0%| | 0/9 [00:00<?, ?it/s]Traceback (most recent call last): File "C:\Users\chenp\Documents\ML\ml.py", line 63, in <module> trainer.train() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train return inner_training_loop( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop for step, inputs in enumerate(epoch_iterator): File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__ current_batch = next(dataloader_iter) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__ data = self._next_data() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data data = self._dataset_fetcher.fetch(index) # may raise StopIteration File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch return self.collate_fn(data) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 271, in __call__ batch = pad_without_fast_tokenizer_warning( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning padded = tokenizer.pad(*pad_args, **pad_kwargs) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 3315, in pad padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 2763, in _get_padding_truncation_strategies raise ValueError( ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`. 0%| | 0/9 [00:00<?, ?it/s] C:\Users\chenp\Documents\ML>python ml.py Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at ./distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 2024-05-20 16:04:04.227458: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-05-20 16:04:04.882429: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 0%| | 0/9 [00:00<?, ?it/s]Traceback (most recent call last): File "C:\Users\chenp\Documents\ML\ml.py", line 63, in <module> trainer.train() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train return inner_training_loop( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop for step, inputs in enumerate(epoch_iterator): File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__ current_batch = next(dataloader_iter) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__ data = self._next_data() File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data data = self._dataset_fetcher.fetch(index) # may raise StopIteration File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch return self.collate_fn(data) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 271, in __call__ batch = pad_without_fast_tokenizer_warning( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning padded = tokenizer.pad(*pad_args, **pad_kwargs) File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 3315, in pad padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 2763, in _get_padding_truncation_strategies raise ValueError( ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`. 0%| | 0/9 [00:00<?, ?it/s]JSON:
{"question": "Who wrote Charlie and the Chocolate Factory?", "answer": "Roald Dahl"} {"question": "Name a few ways to treat constipation naturally.", "answer": "Exercise regularly, eat more fibers, and drink more water."} {"question": "Where is the longest roller coaster located?", "answer": "Nagashima, Japan. The name of the coaster is Steel Dragon 2000."} {"question": "Who murdered JFK?", "answer": "It is said to be Harvey Oswald."} {"question": "What are the 11 herbs and spices that Colonel Sanders used in KFC?", "answer": "Nobody knows, as it's a secret."} {"question": "Who wrote Les Miserables?", "answer": "Victor Hugo"} {"question": "What is the Watergate Scandal?", "answer": "The Watergate scandal was a significant political controversy in the United States during the presidency of Richard Nixon from 1972 to 1974, ultimately resulting in Nixon's resignation. It originated from attempts by the Nixon administration to conceal its involvement in the June 17, 1972, break-in at the Democratic National Committee headquarters located in the Watergate Office Building in Washington, D.C."} {"question": "What is Obama's most famous quote?", "answer": "'Yes we can!'"} {"question": "Where did the 2008 Olympic take place?", "answer": "Beijing"} {"question": "Lentils and Chickpeas are what kind of food?", "answer": "Beans"} {"question": "Who was the disciple that Jesus loved?", "answer": "John"} {"question": "Why did the Boston Tea Party happen?", "answer": "The colonists were unhappy with the tax and restrictions imposed by the British colonists."} {"question": "What was the effect of the Boston Tea Party?", "answer": "The British imposed a new Intolerable Act, and tensions between the colonies and the British escalated."} {"question": "Who conquered the Aztec Empire?", "answer": "Hernan Cortes"} {"question": "What is the longest flight as of 2024?", "answer": "Singapore to New York, operated by Singapore Airlines."} {"question": "Name a few infamous Roman dictators.", "answer": "Caligula, Nero, and Tiberius."} {"question": "Where did the early Hungarians come from?", "answer": "They originated from the Uralic region as nomads, and then migrated to Central Europe's Carpathian basin."} {"question": "Where is the fastest roller coaster located?", "answer": "Abu Dhabi, and the coaster is known as F1."} {"question": "What are some popular painting in Uffizi Gallery?", "answer": "The Birth of Venus, Madonna of the Goldfinch, and Judith and Holofernes."} {"question": "Who wrote A Christmas Carol and Oliver Twist?", "answer": "Charles Dickens"}Suggestions on how I can fix this?