# Ex. 10 - LSTM
# ---------------------

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Load the articles table from the CSV file; only the 'headline' column is
# used from here on.
data = pd.read_csv('ArticlesApril2017.csv')
data.head(1)

# Lowercase each headline in place, then pull the column out as an array.
data['headline'] = data['headline'].map(str.lower)
headlines = data['headline'].values

headlines[:5]

# Tokenize text.
tokenizer = Tokenizer()

# First pass: build the vocabulary from all headlines.
tokenizer.fit_on_texts(headlines)

# Second pass: use that vocabulary to turn each headline into a list of
# integer word indices.
sequences = tokenizer.texts_to_sequences(headlines)

sequences[:5]

# Prepare training data: for every headline the features are all tokens
# except the last one, and the target is that last token.
X = []
y = []
for seq in sequences:
    # A headline needs at least two tokens to provide both a non-empty context
    # and a target. An empty sequence would raise IndexError on seq[-1], and a
    # one-token sequence would yield an all-padding input row.
    if len(seq) < 2:
        continue
    X.append(seq[:-1])  # Feature is everything except last element
    y.append(seq[-1])  # Target is last element

X[:3], y[:3]

# Padding: [23,45] to [0,0,23,45] so that every row has the same length.
X = pad_sequences(X)
y = np.array(y)

X[0], "LABEL", y[0]

# One output class per known word, plus one for index 0 (the padding value).
vocab_size = len(tokenizer.word_index) + 1

# LSTM expects 3D input: (rows, timesteps, features per timestep).
# Each timestep carries a single word index, so add a trailing axis of size 1.
X_forLSTM = np.expand_dims(X, axis=-1)

# Per-sample input shape handed to the LSTM layer: (timesteps, 1).
lstm_shape = X_forLSTM.shape[1:]

X_forLSTM[0]

# LSTM model: one recurrent layer followed by a softmax over the vocabulary.
model = Sequential()
model.add(LSTM(100, input_shape=lstm_shape))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are integer word indices (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.01),
    metrics=['accuracy'],
)

# Train silently for 20 epochs.
model.fit(X_forLSTM, y, epochs=20, verbose=False)

num_words_to_generate = 5  # Generate 5 new words
text = "The cat"

# The model was trained on inputs of exactly this many timesteps.
max_len = X.shape[1]

for _ in range(num_words_to_generate):
    # Turn the text into sequences [123,456,23]
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Pad them [0,0,0,123,456,23]
    padded_sequence = pad_sequences([token_list], maxlen=max_len)

    # Get the prediction (reshape: 1 row, max_len timesteps and 1 feature per
    # timestep). verbose=0 keeps the per-step progress bar out of the output,
    # matching the silent model.fit call.
    predicted = model.predict(padded_sequence.reshape(1, max_len, 1), verbose=0)

    # Find which word has this index. The output layer has vocab_size units,
    # so class 0 (the padding index) can win the argmax, but it has no entry
    # in tokenizer.index_word; stop generating instead of raising KeyError.
    predicted_index = int(np.argmax(predicted))
    predicted_word = tokenizer.index_word.get(predicted_index)
    if predicted_word is None:
        break
    text += " " + predicted_word

print("\nFinal generated text:", text)
