DataParallel is Supported for XPU? #707

Open
yash3056 opened this issue Sep 17, 2024 · 4 comments
@yash3056

Describe the issue

I am facing errors with DataParallel.

alexsin368 self-assigned this Sep 19, 2024
@alexsin368

Hi @yash3056, please describe your issue in detail and provide the code and steps to reproduce it.

@gujinghui (Contributor)

@yash3056

DP is not fully supported on XPU for now.
May I know why you need DP in your case, instead of DDP?
As I recall, DP will be deprecated by PyTorch on GPU.
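
For reference, a rough sketch of what DDP on XPU can look like (assumptions: `intel_extension_for_pytorch` and `oneccl_bindings_for_pytorch` are installed, the script is launched with `torchrun` or `mpirun` so the rendezvous environment variables are set, and the tiny `nn.Linear` is only a placeholder model):

```python
# Rough sketch only; assumes intel_extension_for_pytorch and
# oneccl_bindings_for_pytorch are installed, and that the script is started
# with torchrun/mpirun so RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT are set.
import os
import torch
import torch.distributed as dist
import intel_extension_for_pytorch as ipex  # noqa: F401  enables the XPU device on older PyTorch
import oneccl_bindings_for_pytorch          # noqa: F401  registers the "ccl" distributed backend
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="ccl")      # oneCCL backend for Intel GPUs
local_rank = int(os.environ.get("LOCAL_RANK", 0))
device = torch.device(f"xpu:{local_rank}")

model = torch.nn.Linear(10, 2).to(device)   # placeholder model for the sketch
ddp_model = DDP(model)                      # DDP picks up the module's XPU device

x = torch.randn(4, 10, device=device)
ddp_model(x).sum().backward()               # gradients are all-reduced across ranks
dist.destroy_process_group()
```

Launching with `torchrun --nproc_per_node=<num_xpus> script.py` (or the equivalent `mpirun` command) is what actually spawns one process per device.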

@yash3056 (Author)

> @yash3056
>
> DP is not fully supported on XPU for now. May I know why you need DP in your case, instead of DDP? As I recall, DP will be deprecated by PyTorch on GPU.

1. I wanted confirmation that DP is not supported. I am also facing errors with DDP.
2. I am facing an engine error with XPU.

Here is the code where I am hitting the engine error:

```python
# %%
#!pip install accelerate==1.0.0rc1 datasets

# %%
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from accelerate import Accelerator
import torch
from sklearn.metrics import accuracy_score

# Load IMDB dataset
dataset = load_dataset("imdb")

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

# Tokenize the train and test datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

# Define DataLoaders for batching
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

# Load pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Initialize Accelerator
accelerator = Accelerator()
device = accelerator.device
print(device)

# Move model, optimizer, and dataloaders to the appropriate device
model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, test_dataloader
)
```

```python
# %%
from tqdm.auto import tqdm

def train(model, dataloader, optimizer, accelerator):
    model.train()
    total_loss = 0

    # Use tqdm for progress bar
    loop = tqdm(dataloader, leave=True, desc="Training")

    for batch in loop:
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        accelerator.backward(loss)

        # Optimization step
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Update tqdm description with the current loss
        loop.set_description(f"Training Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    return avg_loss
```

```python
# %%
def evaluate(model, dataloader, accelerator):
    model.eval()
    predictions, labels = [], []

    # Use tqdm for progress bar
    loop = tqdm(dataloader, leave=True, desc="Evaluating")

    with torch.no_grad():
        for batch in loop:
            # Forward pass
            outputs = model(**batch)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(accelerator.gather(preds).cpu().numpy())
            labels.extend(accelerator.gather(batch['labels']).cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)
    return accuracy
```

```python
# %%
def train(model, dataloader, optimizer, accelerator):
    model.train()
    total_loss = 0

    # Use tqdm for progress bar
    loop = tqdm(dataloader, leave=True, desc="Training")

    for batch in loop:
        # Forward pass
        # Only pass input_ids and attention_mask to the model
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
        loss = outputs.loss

        # Backward pass
        accelerator.backward(loss)

        # Optimization step
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Update tqdm description with the current loss
        loop.set_description(f"Training Loss: {loss.item():.4f}")

    avg_loss = total_loss / len(dataloader)
    return avg_loss
```

```python
# %%
epochs = 3

for epoch in range(epochs):
    # Train the model
    avg_train_loss = train(model, train_dataloader, optimizer, accelerator)

    # Evaluate the model
    accuracy = evaluate(model, test_dataloader, accelerator)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
```

Here is the error:

```
[WARNING] Failed to create Level Zero tracer: 2013265921
{
"name": "RuntimeError",
"message": "could not create an engine",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[6], line 5
1 epochs = 3
3 for epoch in range(epochs):
4 # Train the model
----> 5 avg_train_loss = train(model, train_dataloader, optimizer, accelerator)
7 # Evaluate the model
8 accuracy = evaluate(model, test_dataloader, accelerator)

Cell In[5], line 11, in train(model, dataloader, optimizer, accelerator)
6 loop = tqdm(dataloader, leave=True, desc="Training")
8 for batch in loop:
9 # Forward pass
10 # Only pass input_ids and attention_mask to the model
---> 11 outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['label'])
12 loss = outputs.loss
14 # Backward pass

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:1695, in BertForSequenceClassification.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1687 r"""
1688 labels (torch.LongTensor of shape (batch_size,), optional):
1689 Labels for computing the sequence classification/regression loss. Indices should be in [0, ..., 1690 config.num_labels - 1]. If config.num_labels == 1 a regression loss is computed (Mean-Square loss), If
1691 config.num_labels > 1 a classification loss is computed (Cross-Entropy).
1692 """
1693 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1695 outputs = self.bert(
1696 input_ids,
1697 attention_mask=attention_mask,
1698 token_type_ids=token_type_ids,
1699 position_ids=position_ids,
1700 head_mask=head_mask,
1701 inputs_embeds=inputs_embeds,
1702 output_attentions=output_attentions,
1703 output_hidden_states=output_hidden_states,
1704 return_dict=return_dict,
1705 )
1707 pooled_output = outputs[1]
1709 pooled_output = self.dropout(pooled_output)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:1141, in BertModel.forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1134 # Prepare head mask if needed
1135 # 1.0 in head_mask indicate we keep the head
1136 # attention_probs has shape bsz x n_heads x N x N
1137 # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1138 # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1139 head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-> 1141 encoder_outputs = self.encoder(
1142 embedding_output,
1143 attention_mask=extended_attention_mask,
1144 head_mask=head_mask,
1145 encoder_hidden_states=encoder_hidden_states,
1146 encoder_attention_mask=encoder_extended_attention_mask,
1147 past_key_values=past_key_values,
1148 use_cache=use_cache,
1149 output_attentions=output_attentions,
1150 output_hidden_states=output_hidden_states,
1151 return_dict=return_dict,
1152 )
1153 sequence_output = encoder_outputs[0]
1154 pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:694, in BertEncoder.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
683 layer_outputs = self._gradient_checkpointing_func(
684 layer_module.call,
685 hidden_states,
(...)
691 output_attentions,
692 )
693 else:
--> 694 layer_outputs = layer_module(
695 hidden_states,
696 attention_mask,
697 layer_head_mask,
698 encoder_hidden_states,
699 encoder_attention_mask,
700 past_key_value,
701 output_attentions,
702 )
704 hidden_states = layer_outputs[0]
705 if use_cache:

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:584, in BertLayer.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
572 def forward(
573 self,
574 hidden_states: torch.Tensor,
(...)
581 ) -> Tuple[torch.Tensor]:
582 # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
583 self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
--> 584 self_attention_outputs = self.attention(
585 hidden_states,
586 attention_mask,
587 head_mask,
588 output_attentions=output_attentions,
589 past_key_value=self_attn_past_key_value,
590 )
591 attention_output = self_attention_outputs[0]
593 # if decoder, the last output is tuple of self-attn cache

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:514, in BertAttention.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
504 def forward(
505 self,
506 hidden_states: torch.Tensor,
(...)
512 output_attentions: Optional[bool] = False,
513 ) -> Tuple[torch.Tensor]:
--> 514 self_outputs = self.self(
515 hidden_states,
516 attention_mask,
517 head_mask,
518 encoder_hidden_states,
519 encoder_attention_mask,
520 past_key_value,
521 output_attentions,
522 )
523 attention_output = self.output(self_outputs[0], hidden_states)
524 outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/transformers/models/bert/modeling_bert.py:394, in BertSdpaSelfAttention.forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
382 return super().forward(
383 hidden_states,
384 attention_mask,
(...)
389 output_attentions,
390 )
392 bsz, tgt_len, _ = hidden_states.size()
--> 394 query_layer = self.transpose_for_scores(self.query(hidden_states))
396 # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention
397 # mask needs to be such that the encoder's padding tokens are not attended to.
398 is_cross_attention = encoder_hidden_states is not None

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None

File ~/.conda/envs/aza/lib/python3.10/site-packages/torch/nn/modules/linear.py:116, in Linear.forward(self, input)
115 def forward(self, input: Tensor) -> Tensor:
--> 116 return F.linear(input, self.weight, self.bias)

RuntimeError: could not create an engine"
}
```
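
The stack ends in `F.linear` on the XPU device, so one way to narrow this down is to check whether a bare linear layer runs on XPU at all, outside Accelerate and transformers. A minimal check, assuming an XPU-enabled build (the 768/512 shapes just mirror bert-base):

```python
# Minimal isolation check: does a plain nn.Linear forward/backward run on XPU?
# Assumes an XPU-enabled PyTorch build (2.5+); on older stacks,
# `import intel_extension_for_pytorch` may be required first.
import torch

device = torch.device("xpu")
linear = torch.nn.Linear(768, 768).to(device)  # same hidden size as bert-base
x = torch.randn(8, 512, 768, device=device)    # (batch, seq_len, hidden)

y = linear(x)                                  # the traceback's "could not create an engine" was raised here
y.sum().backward()
print(y.shape, y.device)
```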

@yash3056 (Author)

@gujinghui @alexsin368 This code runs fine with PyTorch 2.6 (mainline).
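
A quick, hedged way to confirm which stack is in use on a given machine (`torch.xpu` is only present on XPU-enabled builds, i.e. native PyTorch 2.5+/2.6 or an install with intel_extension_for_pytorch):

```python
# Hedged environment check; torch.xpu exists on XPU-enabled builds
# (native in PyTorch 2.5+/2.6, or provided via intel_extension_for_pytorch).
import torch

print(torch.__version__)
if hasattr(torch, "xpu") and torch.xpu.is_available():
    print("XPU device count:", torch.xpu.device_count())
    print("XPU device name :", torch.xpu.get_device_name(0))
else:
    print("No usable XPU backend found")
```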
