# Preprocess and clean datasets for Generative AI
applications using Python libraries such as Pandas
and NumPy. Handle missing data, normalize
numerical features, and encode categorical
variables for a small dataset (e.g., text or image
metadata).


import pandas as pd
import numpy as np

def load_sample_data():
    data={
        'image_id':['img1','img2','img3','img4','img5'],
        'category':['cat','dog',np.nan,'cat','dog'],
        'size_kb':[1200,850,900,np.nan,1150],
        'resolution':['1024x768','800x600','1024x768','640x480',np.nan],
        'label':['outdoor','indoor','indoor',np.nan,'outdoor']
    }

    df=pd.DataFrame(data)
    print(df)
    return df

def handle_missing_data(df):
    df['category']=df['category'].fillna('unknown')
    df['label']=df['label'].fillna('unknown')
    median_size=df['size_kb'].median()
    df['size_kb']=df['size_kb'].fillna(median_size)
    mode_res=df['resolution'].mode()[0]
    df['resolution']=df['resolution'].fillna(mode_res)
    return df

def normalize_numerical(df,cols):
    for col in cols:
        min_val=df[col].min()
        max_val=df[col].max()
        df[col]=(df[col]-min_val)/(max_val-min_val)
    return df

def encode_categorical(df,cols):
    df_encoded=pd.get_dummies(df,columns=cols,drop_first=True)
    return df_encoded

def run_pipeline():
    df=load_sample_data()
    print("OriginalData:")
    print(df,"\n")
    df=handle_missing_data(df)
    print("AfterHandlingMissingData:")
    print(df,"\n")
    df=normalize_numerical(df,cols=['size_kb'])
    print("AfterNormalizingNumericalFeatures:")
    print(df,"\n")
    df_encoded=encode_categorical(df,cols=['category','resolution','label'])
    print("AfterEncodingCategoricalVariables:")
    print(df_encoded)

if __name__=="__main__":
    run_pipeline()
