# Apply model optimization techniques such as
# pruning or quantization to a small pre-trained
# generative model (e.g., DistilBERT). Test inference
# speed and memory usage on a local machine.



import io
import time

import psutil
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Benchmark settings: a few untimed warm-up passes absorb one-time setup costs,
# and averaging several timed passes smooths out the noise that a single
# measurement is exposed to.
WARMUP_RUNS = 3
TIMED_RUNS = 20


def _mean_latency(net, batch, warmup=WARMUP_RUNS, runs=TIMED_RUNS):
    """Return the mean wall-clock seconds of one forward pass of ``net``.

    Args:
        net: Model invoked as ``net(**batch)``.
        batch: Mapping of keyword inputs for the model (the tokenizer output).
        warmup: Number of untimed passes executed before measuring.
        runs: Number of timed passes that are averaged; must be positive.

    Returns:
        Average duration of a single forward pass, in seconds.
    """
    # Gradients are never needed for inference; disabling autograd avoids
    # recording a graph, which would inflate both latency and memory.
    with torch.no_grad():
        for _ in range(warmup):
            net(**batch)
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() can jump with clock updates.
        start = time.perf_counter()
        for _ in range(runs):
            net(**batch)
        elapsed = time.perf_counter() - start
    return elapsed / runs


def _rss_mb():
    """Return the resident set size of the current process in MB (1e6 bytes)."""
    return psutil.Process().memory_info().rss / 1e6


def _serialized_size_mb(net):
    """Return the size in MB (1e6 bytes) of ``net``'s state dict as written by ``torch.save``."""
    buffer = io.BytesIO()
    torch.save(net.state_dict(), buffer)
    return buffer.getbuffer().nbytes / 1e6


# Load pre-trained DistilBERT
model_name = "distilbert-base-uncased"
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Put the model in evaluation mode explicitly so dropout is off while timing.
model.eval()

# Sample input
text = "Artificial intelligence is transforming the world."
inputs = tok(text, return_tensors="pt")

# Measure original inference time
orig_time = _mean_latency(model, inputs)
print(
    f"Original time: {orig_time:.4f}s | Memory: {_rss_mb():.2f} MB"
    f" | Model size: {_serialized_size_mb(model):.2f} MB"
)

# Quantize model: Linear layers are replaced with dynamically quantized int8
# versions. `model` stays referenced afterwards, so the process RSS reported
# below still includes the original weights and is not a per-model figure;
# the serialized size is the like-for-like comparison.
q_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

# Measure optimized inference time
quant_time = _mean_latency(q_model, inputs)
print(
    f"Quantized time: {quant_time:.4f}s | Memory: {_rss_mb():.2f} MB"
    f" | Model size: {_serialized_size_mb(q_model):.2f} MB"
)