# Exploratory Data Analysis (EDA)
# -------------------------------
# 1) Initial Data Overview
# -------------------------------
# Load the dataset from a CSV file into `df`
import pandas as pd
df = pd.read_csv(filepath_or_buffer='your_dataset.csv')

# Read a tab-delimited text file, assign column names, and export it as CSV
import pandas as pd

# Column names to apply to the data
headers = ["Column1", "Column2", "Column3"]

# Passing names= makes pandas treat every line of the file as data
# (no header row is taken from the file itself)
df = pd.read_csv("file.txt", sep="\t", names=headers)

# Write the result as CSV: the header row is included, the index is not
df.to_csv(path_or_buf="output.csv", index=False)



# Basic information: dimensions, a preview of the rows, and column dtypes
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nData Types:")
print(df.dtypes)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
df.info()

# --------------------------
# 2) Missing Values Analysis
# --------------------------
# Count the missing entries in each column
null_mask = df.isnull()
print("Missing Values Summary:")
print(null_mask.sum())

# Draw the null mask as a heatmap (rows are records, columns are fields)
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

# -------------------------
# 3) Statistical Summary
# -------------------------
# Descriptive statistics for every column, numeric or not
print("Descriptive Statistics:")
print(df.describe(include='all'))

# Extended percentile table for the numerical columns.
# The guard uses include='number' so that it matches what describe() actually
# summarises by default: every numeric dtype (int32, float32, ...), not only
# int64/float64. Without any numeric column describe() would fall back to the
# non-numeric summary, so the section is skipped in that case.
if not df.select_dtypes(include='number').columns.empty:
    print("\nNumerical Columns Statistics:")
    print(df.describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]))

# -------------------------
# 4) Univariate Analysis
# -------------------------

# Numerical columns: one histogram (with KDE overlay) and one boxplot each
num_cols = df.select_dtypes(include=['int64','float64']).columns
for column in num_cols:
    values = df[column]

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(values, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=values, ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()

# Categorical columns: bar chart of how often each value occurs
cat_cols = df.select_dtypes(include=['object','category','bool']).columns
for column in cat_cols:
    frequencies = df[column].value_counts()
    fig, ax = plt.subplots(figsize=(8, 4))
    frequencies.plot.bar(ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.xticks(rotation=45)
    plt.show()

# ----------------------------------
# 5) Bivariate/Multivariate Analysis
# ----------------------------------

# Relationships between numerical variables need at least two of them
if len(num_cols) > 1:
    # Correlation matrix, drawn as an annotated heatmap centred on zero
    corr = df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()

    # Pairplot; frames with more than 1000 rows are sampled down to 1000
    if len(df) > 1000:
        sample_df = df.sample(1000)
    else:
        sample_df = df
    sns.pairplot(sample_df[num_cols])
    plt.show()

# Categorical vs numerical: boxplot of each numerical column per category
if len(num_cols) > 0 and len(cat_cols) > 0:
    # Keep only categorical columns with fewer than 10 distinct values,
    # so the plots do not end up with too many categories
    low_card_cats = [c for c in cat_cols if len(df[c].unique()) < 10]
    for num_col in num_cols:
        for cat_col in low_card_cats:
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.boxplot(x=cat_col, y=num_col, data=df, ax=ax)
            ax.set_title(f'{num_col} by {cat_col}')
            plt.xticks(rotation=45)
            plt.show()

# ---------------------
# 6) Outlier Detection
# ---------------------

# Outlier counts per numerical column, using two independent rules:
# |z-score| > 3 and the 1.5 * IQR fences.
from scipy import stats
import numpy as np

for col in num_cols:
    # Z-score method.
    # nan_policy='omit' leaves missing values out of the mean/std computation.
    # With the default ('propagate') a single NaN turns every z-score of the
    # column into NaN, and the count below would always be 0.
    z = np.abs(stats.zscore(df[col], nan_policy='omit'))
    # NaN entries compare False against the threshold, so they are not counted
    print(f"Outliers in {col}: {int(np.count_nonzero(z > 3))}")

    # IQR method (quantile() skips missing values by default)
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[(df[col] < (Q1 - 1.5*IQR)) | (df[col] > (Q3 + 1.5*IQR))]
    print(f"IQR outliers in {col}: {len(outliers)}")

# ---------------------------------------
# 7) Time Series Analysis (if applicable)
# ---------------------------------------
# Check for datetime columns
# NOTE: only columns that already have a datetime64 dtype are selected, so no
# further conversion is needed inside the loop. Date columns still stored as
# strings (e.g. read_csv without parse_dates) must be converted with
# pd.to_datetime() beforehand to show up here.
date_cols = df.select_dtypes(include=['datetime64']).columns
for col in date_cols:
    # Count the non-null values of every other column per calendar day
    daily_counts = df.set_index(col).resample('D').count()  # Change resample frequency as needed
    # DataFrame.plot() creates its own figure, so the size is passed to it
    # directly; a preceding plt.figure() would only leave an empty figure.
    ax = daily_counts.plot(figsize=(12, 6))
    ax.set_title(f'Trend over time ({col})')
    plt.show()

# ----------------------------------
# 8) Unique Values and Cardinality
# ----------------------------------
# Number of distinct values per column, computed once and reused below
unique_counts = [(name, df[name].nunique()) for name in df.columns]

print("Unique Values Count:")
for name, n_unique in unique_counts:
    print(f"{name}: {n_unique}")

# Columns with more than 50 distinct values
high_cardinality = [name for name, n_unique in unique_counts if n_unique > 50]
print("\nHigh Cardinality Columns (>50 unique values):", high_cardinality)

# --------------------
# 9) Save EDA Report
# --------------------

# Export the descriptive statistics of all columns
summary_stats = df.describe(include='all')
summary_stats.to_csv('eda_summary_statistics.csv')

# Export the per-column missing-value counts as a one-column table
missing_counts = df.isnull().sum().to_frame(name='missing_values')
missing_counts.to_csv('missing_values_summary.csv')










