Need to fix ValueError: Asking to pad but the tokenizer does not have a padding token

alexanderDennisEnviro500 · May-20-2024, 08:07 AM

I want to fine tune a QnA HF model with the JSON file containing questions and answers, but I get aforementioned error.

Full code:

# https://www.mlexpert.io/blog/alpaca-fine-tuning
# https://wellsr.com/python/fine-tuning-huggingface-models-in-tensorflow-keras/
# https://learnopencv.com/fine-tuning-bert/
# https://medium.com/@karary/nlp-fine-tune-question-answering-model-%E5%AF%A6%E4%BD%9C-3-model-training-%E5%84%B2%E5%AD%98%E8%88%87-inference-13d2a5bf5c32
# https://huggingface.co/datasets/Mangacoder007/QNA-chat_app
# https://www.youtube.com/watch?v=0tT5suZSdkA
# https://medium.com/@anyuanay/fine-tuning-the-pre-trained-bert-model-in-hugging-face-for-question-answering-8edc76890ce0
import transformers as tf
import datasets as ds
import pandas as pd
import numpy as np
import torch
import json

############## Check if CUDA is enabled. ################
# hasCUDA=torch.cuda.is_available()
# print(f"CUDA Enabled? {hasCUDA}")
# device="cuda" if hasCUDA else "cpu"      

############## Loading file and populating data ################
dataset = ds.load_dataset("json", data_files="qna.json", split="train")
input_ids=[]

for i in range (len(dataset)):
    input_ids.append(i)

dataset=dataset.add_column('input_ids', input_ids)
############## Model ##########################################
modelName="./distilbert-base-cased"     #or replace the model name with whatever you feel like.
config=tf.AutoConfig.from_pretrained(modelName+"/config.json")
model=tf.AutoModelForQuestionAnswering.from_pretrained(modelName,config=config)
tokenizer=tf.AutoTokenizer.from_pretrained(modelName)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

############## Tokenization #######################################
tokenizer.pad_token=tokenizer.eos_token

def preprocessFunction(examples):
    tokens=tokenizer(examples['question'],  truncation=True)
    return tokens

tokenizedDS=dataset.map(preprocessFunction, batched=True)
############## Training #######################################

trnArgs=tf.TrainingArguments(
    output_dir="./",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    remove_unused_columns=False,
    fp16=True
)

trainer=tf.Trainer(
    model=model,
    args=trnArgs,
    train_dataset=dataset,
    eval_dataset=None,
    tokenizer=tokenizer
)
trainer.train()

Traceback:

Traceback (most recent call last):
  File "C:\Users\chenp\Documents\ML\ml2.py", line 22, in <module>
    dataset = ds.load_dataset("squad", split="train")
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 2587, in load_dataset
    builder_instance = load_dataset_builder(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 2259, in load_dataset_builder
    dataset_module = dataset_module_factory(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 1885, in dataset_module_factory
    return HubDatasetModuleFactoryWithoutScript(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 1195, in __init__
    increase_load_count(name, resource_type="dataset")
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\load.py", line 289, in increase_load_count
    head_hf_s3(name, filename=name + ".py", dataset=(resource_type == "dataset"))
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 111, in head_hf_s3
    return http_head(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 462, in http_head
    response = _request_with_retry(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\datasets\utils\file_utils.py", line 336, in _request_with_retry
    response = requests.request(method=method.upper(), url=url, timeout=timeout, **params)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 587, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\sessions.py", line 701, in send
    r = adapter.send(request, **kwargs)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\requests\adapters.py", line 489, in send
    resp = conn.urlopen(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
    httplib_response = self._make_request(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 386, in _make_request
    self._validate_conn(conn)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connectionpool.py", line 1042, in _validate_conn
    conn.connect()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\connection.py", line 419, in connect
    self.sock = ssl_wrap_socket(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\ssl_.py", line 449, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\urllib3\util\ssl_.py", line 493, in _ssl_wrap_socket_impl
    return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 512, in wrap_socket
    return self.sslsocket_class._create(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1070, in _create
    self.do_handshake()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\ssl.py", line 1341, in do_handshake
    self._sslobj.do_handshake()
KeyboardInterrupt
^C
C:\Users\chenp\Documents\ML>python ml.py
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at ./distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-05-20 16:03:42.130339: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-20 16:03:42.738309: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  0%|                                  | 0/9 [00:00<?, ?it/s]Traceback (most recent call last):
  File "C:\Users\chenp\Documents\ML\ml.py", line 63, in <module>
    trainer.train()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train
    return inner_training_loop(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__
    current_batch = next(dataloader_iter)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
    data = self._next_data()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 271, in __call__
    batch = pad_without_fast_tokenizer_warning(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 3315, in pad
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 2763, in _get_padding_truncation_strategies
    raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
  0%|          | 0/9 [00:00<?, ?it/s]

C:\Users\chenp\Documents\ML>python ml.py
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at ./distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2024-05-20 16:04:04.227458: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-20 16:04:04.882429: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  0%|                                  | 0/9 [00:00<?, ?it/s]Traceback (most recent call last):
  File "C:\Users\chenp\Documents\ML\ml.py", line 63, in <module>
    trainer.train()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 1859, in train
    return inner_training_loop(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\trainer.py", line 2165, in _inner_training_loop
    for step, inputs in enumerate(epoch_iterator):
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\accelerate\data_loader.py", line 454, in __iter__
    current_batch = next(dataloader_iter)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 631, in __next__
    data = self._next_data()
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 271, in __call__
    batch = pad_without_fast_tokenizer_warning(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\data\data_collator.py", line 66, in pad_without_fast_tokenizer_warning
    padded = tokenizer.pad(*pad_args, **pad_kwargs)
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 3315, in pad
    padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
  File "C:\Users\chenp\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\tokenization_utils_base.py", line 2763, in _get_padding_truncation_strategies
    raise ValueError(
ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.
  0%|          | 0/9 [00:00<?, ?it/s]

JSON:

{"question": "Who wrote Charlie and the Chocolate Factory?", "answer": "Roald Dahl"}
{"question": "Name a few ways to treat constipation naturally.", "answer": "Exercise regularly, eat more fibers, and drink more water."}
{"question": "Where is the longest roller coaster located?", "answer": "Nagashima, Japan. The name of the coaster is Steel Dragon 2000."}
{"question": "Who murdered JFK?", "answer": "It is said to be Harvey Oswald."}
{"question": "What are the 11 herbs and spices that Colonel Sanders used in KFC?", "answer": "Nobody knows, as it's a secret."}
{"question": "Who wrote Les Miserables?", "answer": "Victor Hugo"}
{"question": "What is the Watergate Scandal?", "answer": "The Watergate scandal was a significant political controversy in the United States during the presidency of Richard Nixon from 1972 to 1974, ultimately resulting in Nixon's resignation. It originated from attempts by the Nixon administration to conceal its involvement in the June 17, 1972, break-in at the Democratic National Committee headquarters located in the Watergate Office Building in Washington, D.C."}
{"question": "What is Obama's most famous quote?", "answer": "'Yes we can!'"}
{"question": "Where did the 2008 Olympic take place?", "answer": "Beijing"}
{"question": "Lentils and Chickpeas are what kind of food?", "answer": "Beans"}
{"question": "Who was the disciple that Jesus loved?", "answer": "John"}
{"question": "Why did the Boston Tea Party happen?", "answer": "The colonists were unhappy with the tax and restrictions imposed by the British colonists."}
{"question": "What was the effect of the Boston Tea Party?", "answer": "The British imposed a new Intolerable Act, and tensions between the colonies and the British escalated."}
{"question": "Who conquered the Aztec Empire?", "answer": "Hernan Cortes"}
{"question": "What is the longest flight as of 2024?", "answer": "Singapore to New York, operated by Singapore Airlines."}
{"question": "Name a few infamous Roman dictators.", "answer": "Caligula, Nero, and Tiberius."}
{"question": "Where did the early Hungarians come from?", "answer": "They originated from the Uralic region as nomads, and then migrated to Central Europe's Carpathian basin."}
{"question": "Where is the fastest roller coaster located?", "answer": "Abu Dhabi, and the coaster is known as F1."}
{"question": "What are some popular painting in Uffizi Gallery?", "answer": "The Birth of Venus, Madonna of the Goldfinch, and Judith and Holofernes."}
{"question": "Who wrote A Christmas Carol and Oliver Twist?", "answer": "Charles Dickens"}

Suggestions on how I can fix this?

Need to fix ValueError: Asking to pad but the tokenizer does not have a padding token

User Panel Messages

Announcements