Metadata-Version: 2.1
Name: shimpiproductions
Version: 0.2
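Summary: Data-mining practice script: pandas preprocessing, PCA, Apriori association rules, K-Means clustering and Gaussian Naive Bayes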
Home-page: https://upload.pypi.org/legacy/
Author: SHIMPI PRODUCTIONS
Author-email: sarveshshimpi18@gmail.com
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Description-Content-Type: text/markdown
License-File: License.txt
Requires-Dist: pandas
Requires-Dist: scikit-learn
Requires-Dist: numpy
Requires-Dist: matplotlib
Requires-Dist: apyori


import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import scale, MinMaxScaler,OneHotEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from apyori import apriori
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Load the housing data set and print the full frame followed by a few
# quick views of it (first rows, last rows, per-column max and min).
df = pd.read_csv("housing.csv")
print(df)
for view_name in ("head", "tail", "max", "min"):
    print(getattr(df, view_name)())


# Descriptive statistics of the median_house_value column, one per print.
median_house_value_column = df['median_house_value']

for stat_name in ("var", "median", "mode", "mean", "std", "count", "describe"):
    print(getattr(median_house_value_column, stat_name)())
print(median_house_value_column.dtypes)

# Discard every row that contains a missing value, then show what is left.
df = df.dropna()
print(df)

# L2-normalise the median_house_value column.
# The column is reshaped to (n_samples, 1). normalize() scales each *row* by
# default (axis=1), which would turn every single-value row into 1.0, so
# axis=0 is passed to normalise down the column instead.
x_array = np.array(df['median_house_value']).reshape(-1, 1)
normalized_arr = preprocessing.normalize(x_array, axis=0)
print(normalized_arr)


# One-hot encode the non-numeric columns and min-max scale the numeric ones.
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
# Keep the categorical columns in DataFrame order; a set difference would
# make the order of the generated dummy columns depend on string hashing.
categorical_columns = [col for col in df.columns if col not in numeric_columns]
df_encoded = pd.get_dummies(df, columns=categorical_columns)
# Numeric columns are rescaled into the range [0, 2]; the dummy columns
# produced above are left untouched.
scaler = MinMaxScaler(feature_range=(0, 2))
df_encoded[numeric_columns] = scaler.fit_transform(df_encoded[numeric_columns])
print(df_encoded.head())

# Fit a PCA with all components on the encoded/scaled frame and report the
# share of variance captured by each component.
pca = PCA().fit(df_encoded)
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

# Read the two CSV files (first line is the header) and join them on the
# shared Student_id column.
df1 = pd.read_csv("student.csv", header=0)
df2 = pd.read_csv("mark.csv", header=0)
df_stu = df1.merge(df2, on='Student_id')

# Preview the joined table and report its (rows, columns) shape.
preview_rows = df_stu.head()
print(preview_rows)
print(df_stu.shape)


# Load the basket data; the file has no header row, every line is one basket.
store_data = pd.read_csv("store_data.csv", header=None)
print(store_data.head())

# Build one transaction (list of item strings) per CSV row.
# - Iterating over the actual rows removes the dependency on a fixed
#   7501 x 20 shape, which raised IndexError on smaller files and silently
#   ignored extra rows/columns on larger ones.
# - Missing cells (short baskets are padded with NaN by read_csv) are
#   skipped so that no artificial "nan" item ends up in the itemsets.
records = [
    [str(value) for value in row if pd.notna(value)]
    for row in store_data.values
]

# NOTE(review): apyori's documented keyword arguments are min_support,
# min_confidence, min_lift and max_length; min_length appears to be ignored.
# It is kept unchanged here -- confirm whether max_length was intended.
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)
print(association_results)

# Report how many relations were found and list the itemset of each one.
print("There are {} Relation derived.".format(len(association_results)))
for relation in association_results:
    print(relation[0])

for item in association_results:
    # Each result is (items, support, ordered_statistics). The first ordered
    # statistic is (items_base, items_add, confidence, lift).
    # The rule text is taken from that statistic rather than from the
    # unordered itemset, so the printed direction matches the confidence and
    # lift printed below, itemsets with more than two items are shown in
    # full, and single-item sets no longer raise IndexError.
    items_base, items_add, confidence, lift = item[2][0][:4]
    antecedent = ", ".join(sorted(items_base))
    consequent = ", ".join(sorted(items_add))
    print("Rule: " + antecedent + " -> " + consequent)

    # Support of the whole itemset.
    print("Support: " + str(item[1]))

    # Confidence and lift of the rule printed above.
    print("Confidence: " + str(confidence))
    print("Lift: " + str(lift))
    print("=====================================")


# Small hand-made 2-D data set used to demonstrate K-Means with k=2
# (cluster labels are 0 and 1).
X = np.array([
    [5, 3], [10, 15], [15, 12], [24, 10], [30, 45],
    [85, 70], [71, 80], [60, 78], [55, 52], [80, 91],
])
kmeans = KMeans(n_clusters=2).fit(X)
centers = kmeans.cluster_centers_
print(centers)

x_coords, y_coords = X[:, 0], X[:, 1]

# Figure 1: the raw points.
plt.scatter(x_coords, y_coords, label='True Position')
plt.show()

# Figure 2: the points coloured by assigned cluster.
plt.scatter(x_coords, y_coords, c=kmeans.labels_, cmap='rainbow')
plt.show()

# Figure 3: coloured points plus the cluster centres in black.
plt.scatter(x_coords, y_coords, c=kmeans.labels_, cmap='rainbow')
plt.scatter(centers[:, 0], centers[:, 1], color='black')
plt.show()

# One-hot encode the housing frame; drop_first=True removes the redundant
# first dummy level of each encoded column.
df_kmeans = pd.get_dummies(df, drop_first=True)
print(df_kmeans.head())

# Standardise every column of the encoded frame.
# NOTE(review): scaled_df_kmeans is not referenced again in the visible part
# of this file -- the clustering below runs on the unscaled data; confirm
# whether that is intended.
# NOTE(review): StandardScaler is imported at the top of the file from
# sklearn.discriminant_analysis; its documented home is sklearn.preprocessing.
scaler = StandardScaler()
scaler.fit(df_kmeans)
scaled_df_kmeans = scaler.transform(df_kmeans)

# Y is the housing_median_age column; X is the housing frame without the
# ocean_proximity column.
Y = df.housing_median_age
X = df.drop('ocean_proximity', axis=1)

# Cluster the housing rows into three groups with K-Means.
kmeans_model = KMeans(n_clusters=3)
kmeans_model.fit(X)
clusters = kmeans_model.labels_

# Place the cluster label in a new "Cluster" column directly before
# housing_median_age.
age_position = X.columns.get_loc("housing_median_age")
X.insert(age_position, "Cluster", clusters)

# Show how many rows fell into each cluster.
cluster_sizes = X["Cluster"].value_counts()
print(cluster_sizes)

# Elbow analysis: fit K-Means for several values of K and record the sum of
# squared distances (inertia) so a suitable number of clusters can be chosen.
# The "Cluster" label column added by the previous clustering step is an
# output of K-Means, not an input feature, so it is excluded here
# (errors="ignore" keeps this working if the column is absent).
cluster_features = X.drop(columns=["Cluster"], errors="ignore")
k_values = range(2, 9)
ssd = []
for k in k_values:
    kmeans_model = KMeans(n_clusters=k)
    kmeans_model.fit(cluster_features)
    ssd.append(kmeans_model.inertia_)

# Plotting SSD for different K values
plt.figure(figsize=(6, 4), dpi=100)
plt.plot(k_values, ssd, color="green", marker="o")
plt.xlabel("Number of clusters (K)")
plt.ylabel("SSD for K")
plt.show()

# Splitting the data for Gaussian Naive Bayes classification.
# Y is the housing_median_age column, which is still present in X; it must
# be removed from the features, otherwise the model is handed the answer
# (target leakage) and the reported accuracy is meaningless.
# NOTE(review): the "Cluster" column, if present, was derived from data that
# included housing_median_age -- consider dropping it as well.
model_features = X.drop(columns=['housing_median_age'])
x_train, x_test, y_train, y_test = train_test_split(
    model_features, Y, test_size=0.25, random_state=42
)

# Gaussian Naive Bayes classifier
naive = GaussianNB()
naive_model = naive.fit(x_train, y_train)

# Predictions and evaluation on the held-out 25% of the rows
y_pred = naive_model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print('The accuracy of the model is: ', accuracy)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


