from google.colab import userdata
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
# --- Load a sample sentence and POS-tag it with spaCy and with NLTK ---------
# spaCy is imported here because nothing earlier in the file brings it into
# scope; without it the spacy.load() call below fails with NameError.
import spacy

# Fetch the NLTK resources used by nltk.word_tokenize and nltk.pos_tag below.
# NOTE(review): newer NLTK releases may look these up under different resource
# names (e.g. 'punkt_tab') — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Read only the first 5000 rows of the CSV to keep the working set small.
corpus_df = pd.read_csv("/content/samsum-train.csv", nrows=5000)
corpus_df.head()

# Use the 'text' value at index label 0 as the example sentence for every
# tagger in this script.
sentence = corpus_df['text'][0]

# POS tagging with spaCy: run the small English pipeline over the sentence and
# collect, per token, the token itself, its fine-grained tag (tag_) and its
# coarse-grained part of speech (pos_).
nlp = spacy.load('en_core_web_sm')
sentence_nlp = nlp(sentence)
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
# Transposed so each token becomes a column and the tags read across rows.
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type']).T

# POS tagging with NLTK's default tagger on NLTK's own tokenization of the
# same sentence (nltk is already imported at the top of the file).
nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag']).T
# --- Rule-based POS tagging with regular expressions ------------------------
from nltk.tag import RegexpTagger

# Ordered (regex, tag) rules handed to RegexpTagger. Order matters: the more
# specific suffix rules come first and the catch-all noun rule comes last, so
# e.g. the possessive "'s" rule is tried before the plain plural "s" rule.
patterns = [
    (r'.*ing$', 'VBG'),                 # gerunds
    (r'.*ed$', 'VBD'),                  # simple past
    (r'.*es$', 'VBZ'),                  # 3rd singular present
    (r'.*ould$', 'MD'),                 # modals
    (r'.*\'s$', 'NN$'),                 # possessive nouns
    (r'.*s$', 'NNS'),                   # plural nouns
    # Cardinal numbers with an optional decimal part. The dot is escaped so it
    # only matches a literal decimal point; an unescaped '.' would match any
    # character and tag tokens such as "1a5" as numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),                      # nouns (default)
]

# Build the tagger from the rules and apply it to the tokenized sentence.
rt = RegexpTagger(patterns)
rt.tag(nltk.word_tokenize(sentence))
# --- N-gram POS taggers trained on NLTK's 'treebank' corpus -----------------
nltk.download('treebank')
from nltk.corpus import treebank
from nltk.tag import BigramTagger, TrigramTagger, UnigramTagger

# Split the tagged sentences: the first 3500 train the taggers, the remainder
# is kept in test_data (not used within this section).
data = treebank.tagged_sents()
train_data, test_data = data[:3500], data[3500:]

# One tagger per n-gram order, all trained on the same data. No backoff tagger
# is passed, so each model stands entirely on its own.
ut, bt, tt = (
    tagger_cls(train_data)
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger)
)

# Tag the same tokenized sentence with each model, in unigram -> bigram ->
# trigram order, printing one result per line for side-by-side comparison.
sentence_tokens = nltk.word_tokenize(sentence)
for ngram_tagger in (ut, bt, tt):
    print(ngram_tagger.tag(sentence_tokens))

