#Exp2: chisquare test
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
import string
# English stop words, held in a set so membership tests are O(1).
stop_words = set(stopwords.words('english'))

# Toy corpus. Each string is tokenised on its own, so no bigram ever spans a
# sentence boundary.
sentences = [
    "I love studying data science.",
    "Data science is an interesting field.",
    "Science requires data for analysis.",
    "Data is key in modern science.",
    "Data science helps in business decision-making."
]

# `token in string.punctuation` is a substring test, which lets
# multi-character punctuation tokens (e.g. "``", "''", "--") through. Testing
# character by character drops every token made only of punctuation.
punctuation_chars = set(string.punctuation)

# Maps each (first_word, second_word) bigram to its number of occurrences.
bigram_count = Counter()
for sentence in sentences:
    tokens = word_tokenize(sentence.lower())
    tokens_new = [
        token for token in tokens
        if token not in stop_words
        and not all(char in punctuation_chars for char in token)
    ]
    bigram_count.update(bigrams(tokens_new))


# Bigram (word1, word2) whose association is tested. The corpus tokens were
# lower-cased before the bigrams were counted, so the input is normalised the
# same way; otherwise a capitalised entry such as "Data" could never match.
word1 = input("Enter the word1:").strip().lower()
word2 = input("Enter the word2:").strip().lower()

# Contingency matrix of bigram counts, including the marginal totals:
#   row 0 = first word is word1,      row 1 = first word is not word1
#   col 0 = second word is word2,     col 1 = second word is not word2
#   row 2 / col 2 = column totals / row totals, C[2][2] = grand total
C = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]

for (first, second), count in bigram_count.items():
    row = 0 if first == word1 else 1
    col = 0 if second == word2 else 1
    C[row][col] += count

# Row totals, column totals and the grand total.
C[0][2] = C[0][0] + C[0][1]
C[1][2] = C[1][0] + C[1][1]
C[2][0] = C[0][0] + C[1][0]
C[2][1] = C[0][1] + C[1][1]
tot = C[2][0] + C[2][1]
C[2][2] = tot  # previously left at 0, so the printed table showed a wrong total

print("Contingency matrix:")
for row in C:
    print(" ".join(str(val) for val in row))

# Expected counts under independence: E[i][j] = row_total_i * col_total_j / tot.
# With no bigrams at all (tot == 0) every expected count stays 0.
E = [[0.0, 0.0], [0.0, 0.0]]
if tot:
    for i in range(2):
        for j in range(2):
            E[i][j] = (C[i][2] * C[2][j]) / tot

print("Expected matrix:")
for row in E:
    print(" ".join(f"{val:.2f}" for val in row))

obs_mat = [C[0][0], C[0][1], C[1][0], C[1][1]]
exp_mat = [E[0][0], E[0][1], E[1][0], E[1][1]]

# Pearson chi-square statistic: sum over the four cells of (O - E)^2 / E.
# A cell with E == 0 (word1 or word2 absent from the corpus) also has O == 0
# and contributes nothing; skipping it avoids a ZeroDivisionError.
chi2test = 0
for observed, expected in zip(obs_mat, exp_mat):
    if expected != 0:
        chi2test += (observed - expected) ** 2 / expected

cric_val = float(input("Enter critical value:"))

# H0: word1 and word2 occur independently of each other.
if chi2test > cric_val:
    print("Reject H0")
else:
    print("Accept H0")

# ------------------------------------
# 2-(b) Chi-square, bigram and collocation

import nltk
from nltk.corpus import stopwords
# Show the stop-word list that is filtered out of the corpus below.
print(stopwords.words('english'))

from nltk.tokenize import word_tokenize
# Tokeniser smoke test; the result is discarded when run as a script.
text = 'I study for NLP.'
word_tokenize(text)

import string
# Read the whole corpus. The context manager closes the file even if the read
# fails (the handle used to be left open).
with open("/content/SAMPLETEXT.txt", "r") as f:
    text = f.read()
stop_words = set(stopwords.words("english"))
word_tokens = word_tokenize(text)
print(word_tokens)

# The stop-word list is lower-case while the tokens keep their original case,
# so compare case-insensitively ("The", "I", ... are stop words too).
# `word in string.punctuation` is a substring test that misses multi-character
# punctuation tokens, so punctuation is checked character by character.
punctuation_chars = set(string.punctuation)
word_tokens = [
    word for word in word_tokens
    if word.lower() not in stop_words
    and not all(char in punctuation_chars for char in word)
]
print(word_tokens)


# Critical value for the t-test; kept as entered and converted with float()
# where it is compared.
cv = input("enter the critical value : ")
def collocation(w1, w2):
    """Run a t-test for the bigram (w1, w2) against the global token list.

    Reads the module-level ``word_tokens`` (list of tokens) and ``cv``
    (critical value, anything ``float()`` accepts).

    With N tokens, the null hypothesis is that the two words are independent,
    giving an expected bigram probability of P(w1) * P(w2). The observed
    probability is x = count(w1 w2) / N and the statistic is
    t = (x - P(w1) * P(w2)) / sqrt(x / N).

    Args:
        w1: First word of the candidate bigram.
        w2: Second word of the candidate bigram.

    Returns:
        ``[w1, w2, t]`` when t exceeds the critical value (a message and the
        statistic are printed as well), otherwise an empty list. An empty
        list is also returned when the token list is empty or the pair never
        occurs adjacently, since the statistic is undefined there and the
        null hypothesis cannot be rejected.
    """
    nl = []
    N = len(word_tokens)
    if N == 0:
        return nl

    pw1 = word_tokens.count(w1)
    pw2 = word_tokens.count(w2)
    # Expected bigram probability if w1 and w2 were independent.
    md = (pw1 / N) * (pw2 / N)

    # Number of times w1 is immediately followed by w2.
    pw12 = sum(
        1
        for first, second in zip(word_tokens, word_tokens[1:])
        if first == w1 and second == w2
    )
    if pw12 == 0:
        # x would be 0, making the denominator sqrt(x / N) zero.
        return nl

    x = pw12 / N
    t = (x - md) / (x / N) ** 0.5
    if float(t) > float(cv):
        print("hypothesis rejected thus the given words( ",w1," ",w2," ) form a collocation")
        print(t)
        nl.append(w1)
        nl.append(w2)
        nl.append(t)
    return nl
# Test every distinct adjacent word pair once, in first-seen order, and keep
# only the pairs reported as collocations. dict.fromkeys() de-duplicates the
# pairs up front; the earlier approach removed items from `fcol` while
# iterating over it, which skips elements and can leave duplicates behind.
fcol = []
for w1, w2 in dict.fromkeys(zip(word_tokens, word_tokens[1:])):
    result = collocation(w1, w2)
    if result:
        fcol.append(result)

# Each entry is [w1, w2, t].
for i in fcol:
    print(i)

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def collocation1(w1, w2, word_tokens, cv):
    """Run a chi-square test for the bigram (w1, w2) over ``word_tokens``.

    A 2x2 table (w1 / not w1 as first word, w2 / not w2 as second word) is
    built from the unigram counts and the adjacent-pair count, using the
    number of tokens N as the table total. Expected counts assume the two
    words are independent; the statistic is the sum of (O - E)^2 / E over the
    cells whose expected count is non-zero.

    Args:
        w1: First word of the candidate bigram.
        w2: Second word of the candidate bigram.
        word_tokens: List of tokens to analyse.
        cv: Critical value; anything ``float()`` accepts.

    Returns:
        ``[w1, w2, X]`` when the statistic X exceeds ``cv`` (a message and
        the statistic are printed as well), otherwise an empty list. An empty
        token list also yields an empty list instead of dividing by zero.
    """
    nl = []
    N = len(word_tokens)
    if N == 0:
        return nl

    pw1 = word_tokens.count(w1)
    pw2 = word_tokens.count(w2)
    # Number of times w1 is immediately followed by w2.
    pw12 = sum(
        1
        for first, second in zip(word_tokens, word_tokens[1:])
        if first == w1 and second == w2
    )

    # (observed, expected) for each cell of the 2x2 table.
    cells = [
        (pw12, (pw1 * pw2) / N),                          # w1, w2
        (pw1 - pw12, (pw1 * (N - pw2)) / N),              # w1, not w2
        (pw2 - pw12, ((N - pw1) * pw2) / N),              # not w1, w2
        (N - pw1 - pw2 + pw12, ((N - pw1) * (N - pw2)) / N),  # not w1, not w2
    ]

    X = 0.0
    for observed, expected in cells:
        if expected != 0:
            X += ((observed - expected) ** 2) / expected

    if X > float(cv):
        print("Hypothesis rejected: the words ('", w1, "', '", w2, "') form a collocation.")
        print("Chi-squared:", X)
        nl.append(w1)
        nl.append(w2)
        nl.append(X)
    return nl

def find_collocations(text, cv=1, remove_stopwords=True):
    """Return the adjacent word pairs in ``text`` that pass the chi-square test.

    The text is lower-cased and tokenised, and non-alphanumeric tokens
    (punctuation) are dropped. Each distinct adjacent pair is then tested
    once with ``collocation1``.

    Args:
        text: Raw text to analyse.
        cv: Critical value passed on to ``collocation1``.
        remove_stopwords: When true, English stop words are removed before
            pairs are formed; when false they are kept. (This flag used to be
            ignored and stop words were always removed.)

    Returns:
        A list of ``[w1, w2, chi_square]`` entries, one per distinct pair
        whose statistic exceeds ``cv``, in order of first appearance.
    """
    word_tokens = [w for w in word_tokenize(text.lower()) if w.isalnum()]
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        word_tokens = [w for w in word_tokens if w not in stop_words]

    filtered_fcol = []
    # dict.fromkeys() keeps the first occurrence of each pair, so repeated
    # bigrams are tested (and reported) only once.
    for w1, w2 in dict.fromkeys(zip(word_tokens, word_tokens[1:])):
        result = collocation1(w1, w2, word_tokens, cv)
        if result:
            filtered_fcol.append(result)
    return filtered_fcol


text = "This is a strong coffee. It is very strong, and I like strong coffee.  Make coffee and make decision.  I had a strong feeling about that."
collocations_with_stopwords = find_collocations(text, remove_stopwords=False)
print("Collocations (with stopwords):", collocations_with_stopwords)

