← BACK

NLP(5) Integer Encoding

Posting information

Posting Date : 04-08-2024
Last Edit : 05-08-2024
Writer : KWON Bongjae
Posting detail

#01_05 : Integer Encoding

  

#1) using dictionary

from nltk.tokenize import sent_tokenize

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

  

# Sample passage used throughout this note. Lines are separated by a single
# newline (no blank lines in between), matching the sent_tokenize output
# shown below.
raw_text = """Edward Bear, known to his friends as Winnie-the-Pooh, or Pooh for
short, was walking through the forest one day, humming proudly to
himself. Pooh had made up a little hum that very morning, as Pooh was doing
his Stoutness Exercises in front of the glass: _Tra-la-la, tra-la-la_,
as he stretched up as high as he could go, and then _Tra-la-la,
tra-la--oh, help!--la_, as he tried to reach his toes."""

  

sentences = sent_tokenize(raw_text)

print(sentences)

#['Edward Bear, known to his friends as Winnie-the-Pooh, or Pooh for\nshort, was walking through the forest one day, humming proudly to\nhimself.', 'Pooh had made up a little hum that very morning, as Pooh was doing\nhis Stoutness Exercises in front of the glass: _Tra-la-la, tra-la-la_,\nas he stretched up as high as he could go, and then _Tra-la-la,\ntra-la--oh, help!--la_, as he tried to reach his toes.']

  

vocab = {} # dictionary: word -> number of times it was kept during preprocessing

preprocessed_sentences = [] # list with one list of kept tokens per sentence

stop_words = set(stopwords.words('english')) # set, so the membership test in the loop is cheap

  

# Tokenize each sentence, lower-case the tokens, drop stopwords and tokens of
# two characters or fewer, and count every kept word in `vocab`.
for sentence in sentences:
    tokenized_sentence = word_tokenize(sentence)
    result = []

    for word in tokenized_sentence:
        word = word.lower()  # fold case so "Pooh" and "pooh" share one count

        # Keep only non-stopwords that are longer than two characters.
        if word not in stop_words and len(word) > 2:
            result.append(word)
            vocab[word] = vocab.get(word, 0) + 1

    # One token list per sentence, appended once the sentence is finished.
    preprocessed_sentences.append(result)

  

print(preprocessed_sentences)

#[['edward', 'bear', 'known', 'friends', 'winnie-the-pooh', 'pooh', 'short', 'walking', 'forest', 'one', 'day', 'humming', 'proudly'], ['pooh', 'made', 'little', 'hum', 'morning', 'pooh', 'stoutness', 'exercises', 'front', 'glass', '_tra-la-la', 'tra-la-la_', 'stretched', 'high', 'could', '_tra-la-la', 'tra-la', 'help', 'la_', 'tried', 'reach', 'toes']]
 

  

print('Vocab :', vocab)

#Vocab : {'edward': 1, 'bear': 1, 'known': 1, 'friends': 1, 'winnie-the-pooh': 1, 'pooh': 3, 'short': 1, 'walking': 1, 'forest': 1, 'one': 1, 'day': 1, 'humming': 1, 'proudly': 1, 'made': 1, 'little': 1, 'hum': 1, 'morning': 1, 'stoutness': 1, 'exercises': 1, 'front': 1, 'glass': 1, '_tra-la-la': 2, 'tra-la-la_': 1, 'stretched': 1, 'high': 1, 'could': 1, 'tra-la': 1, 'help': 1, 'la_': 1, 'tried': 1, 'reach': 1, 'toes': 1}

  

print(vocab["pooh"])

#3

  

# Order the (word, count) pairs by count, highest first. The sort is stable,
# so words with equal counts keep their original insertion order.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda pair: pair[1], reverse=True)
print(vocab_sorted)

#[('pooh', 3), ('_tra-la-la', 2), ('edward', 1), ('bear', 1), ('known', 1), ('friends', 1), ('winnie-the-pooh', 1), ('short', 1), ('walking', 1), ('forest', 1), ('one', 1), ('day', 1), ('humming', 1), ('proudly', 1), ('made', 1), ('little', 1), ('hum', 1), ('morning', 1), ('stoutness', 1), ('exercises', 1), ('front', 1), ('glass', 1), ('tra-la-la_', 1), ('stretched', 1), ('high', 1), ('could', 1), ('tra-la', 1), ('help', 1), ('la_', 1), ('tried', 1), ('reach', 1), ('toes', 1)]

  

# Assign consecutive integer indices (starting at 1) in descending frequency
# order, keeping only words that occur more than once.
word_to_index = {}
i = 0
for (word, frequency) in vocab_sorted:
    if frequency > 1:
        # The counter only advances for kept words, so indices stay contiguous.
        i = i + 1
        word_to_index[word] = i

print(word_to_index)

#{'pooh': 1, '_tra-la-la': 2}

  

# Keep only the `vocab_size` most frequent words (indices 1..vocab_size).
vocab_size = 1

# Collect the words to drop first: deleting from a dict while iterating over
# it is not allowed, so the removal happens in a separate loop.
words_frequency = [word for word, index in word_to_index.items() if index >= vocab_size + 1]

for w in words_frequency:
    del word_to_index[w]
print(word_to_index)
#{'pooh': 1}

# Reserve the next free index for every word that is not in the vocabulary.
word_to_index['OOV'] = len(word_to_index) + 1 #Out-Of-Vocabulary
print(word_to_index)
#{'pooh': 1, 'OOV': 2}

  

# Replace every word by its integer index; words that are not in
# word_to_index are mapped to the 'OOV' index.
oov_index = word_to_index['OOV']

encoded_sentences = []
for sentence in preprocessed_sentences:
    # The per-sentence list needs its own name. Reusing `encoded_sentences`
    # here would reset the outer list on every iteration and finally append
    # the list to itself, which prints as [..., [...]].
    encoded_sentence = [word_to_index.get(word, oov_index) for word in sentence]
    encoded_sentences.append(encoded_sentence)

print(encoded_sentences)
#[[2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2], [1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]]

  
  

#2) Counter

  

from collections import Counter

print(preprocessed_sentences)

#[['edward', 'bear', 'known', 'friends', 'winnie-the-pooh', 'pooh', 'short', 'walking', 'forest', 'one', 'day', 'humming', 'proudly'], ['pooh', 'made', 'little', 'hum', 'morning', 'pooh', 'stoutness', 'exercises', 'front', 'glass', '_tra-la-la', 'tra-la-la_', 'stretched', 'high', 'could', '_tra-la-la', 'tra-la', 'help', 'la_', 'tried', 'reach', 'toes']]
 

  

# Flatten the per-sentence token lists into one list of words, keeping order.
all_words_list = [token for tokens in preprocessed_sentences for token in tokens]
print(all_words_list)

#['edward', 'bear', 'known', 'friends', 'winnie-the-pooh', 'pooh', 'short', 'walking', 'forest', 'one', 'day', 'humming', 'proudly', 'pooh', 'made', 'little', 'hum', 'morning', 'pooh', 'stoutness', 'exercises', 'front', 'glass', '_tra-la-la', 'tra-la-la_', 'stretched', 'high', 'could', '_tra-la-la', 'tra-la', 'help', 'la_', 'tried', 'reach', 'toes']

  

vocab = Counter(all_words_list)

print(vocab)

#Counter({'pooh': 3, '_tra-la-la': 2, 'edward': 1, 'bear': 1, 'known': 1, 'friends': 1, 'winnie-the-pooh': 1, 'short': 1, 'walking': 1, 'forest': 1, 'one': 1, 'day': 1, 'humming': 1, 'proudly': 1, 'made': 1, 'little': 1, 'hum': 1, 'morning': 1, 'stoutness': 1, 'exercises': 1, 'front': 1, 'glass': 1, 'tra-la-la_': 1, 'stretched': 1, 'high': 1, 'could': 1, 'tra-la': 1, 'help': 1, 'la_': 1, 'tried': 1, 'reach': 1, 'toes': 1})

  

print(vocab["pooh"])

#3

  

# Keep only the `vocab_size` most frequent words. most_common() returns a
# list of (word, count) pairs, most frequent first.
vocab_size = 3
vocab = vocab.most_common(vocab_size)
print(vocab)
#[('pooh', 3), ('_tra-la-la', 2), ('edward', 1)]

# Give the most frequent word index 1, the next one index 2, and so on.
word_to_index = {}
i = 0
for (word, frequency) in vocab:
    i = i + 1
    word_to_index[word] = i

print(word_to_index)

#{'pooh': 1, '_tra-la-la': 2, 'edward': 3}

  
  

#FreqDist in NLTK

  

from nltk import FreqDist

import numpy as np

  

vocab = FreqDist(np.hstack(preprocessed_sentences))

  

print(vocab["pooh"])

#3

  

vocab_size = 3

vocab = vocab.most_common(vocab_size)

print(vocab)

#[('pooh', 3), ('_tra-la-la', 2), ('edward', 1)]

  

# Map each word to its 1-based rank in the most_common() list.
word_to_index = {token: rank for rank, (token, _count) in enumerate(vocab, start=1)}
print(word_to_index)

#{'pooh': 1, '_tra-la-la': 2, 'edward': 3}

  
  

#enumerate

# enumerate() yields (index, value) pairs, with the index starting at 0.
test_input = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
for index, value in enumerate(test_input):
    print("value : {}, index: {}".format(value, index))

"""

value : a, index: 0

value : b, index: 1

value : c, index: 2

value : d, index: 3

value : e, index: 4

value : f, index: 5

value : g, index: 6

"""

  
  

#Text preprocessing in keras

  

from tensorflow.keras.preprocessing.text import Tokenizer

  

tokenizer = Tokenizer()

tokenizer.fit_on_texts(preprocessed_sentences)

print(tokenizer.word_index)

#{'pooh': 1, '_tra-la-la': 2, 'edward': 3, 'bear': 4, 'known': 5, 'friends': 6, 'winnie-the-pooh': 7, 'short': 8, 'walking': 9, 'forest': 10, 'one': 11, 'day': 12, 'humming': 13, 'proudly': 14, 'made': 15, 'little': 16, 'hum': 17, 'morning': 18, 'stoutness': 19, 'exercises': 20, 'front': 21, 'glass': 22, 'tra-la-la_': 23, 'stretched': 24, 'high': 25, 'could': 26, 'tra-la': 27, 'help': 28, 'la_': 29, 'tried': 30, 'reach': 31, 'toes': 32}

  

print(tokenizer.word_counts)

#OrderedDict([('edward', 1), ('bear', 1), ('known', 1), ('friends', 1), ('winnie-the-pooh', 1), ('pooh', 3), ('short', 1), ('walking', 1), ('forest', 1), ('one', 1), ('day', 1), ('humming', 1), ('proudly', 1), ('made', 1), ('little', 1), ('hum', 1), ('morning', 1), ('stoutness', 1), ('exercises', 1), ('front', 1), ('glass', 1), ('_tra-la-la', 2), ('tra-la-la_', 1), ('stretched', 1), ('high', 1), ('could', 1), ('tra-la', 1), ('help', 1), ('la_', 1), ('tried', 1), ('reach', 1), ('toes', 1)])

  

print(tokenizer.texts_to_sequences(preprocessed_sentences))

#[[3, 4, 5, 6, 7, 1, 8, 9, 10, 11, 12, 13, 14], [1, 15, 16, 17, 18, 1, 19, 20, 21, 22, 2, 23, 24, 25, 26, 2, 27, 28, 29, 30, 31, 32]]
 

# Number of most frequent words to keep when encoding.
vocab_size = 3

# NOTE(review): num_words is vocab_size + 1, presumably because index 0 is
# reserved and num_words acts as an exclusive upper bound on the emitted
# indices -- confirm against the Keras Tokenizer documentation.
tokenizer = Tokenizer(num_words = vocab_size +1)

tokenizer.fit_on_texts(preprocessed_sentences)

# word_index still lists every word (see the printed output); the limit only
# shows up in the texts_to_sequences output further down.
print(tokenizer.word_index)

#{'pooh': 1, '_tra-la-la': 2, 'edward': 3, 'bear': 4, 'known': 5, 'friends': 6, 'winnie-the-pooh': 7, 'short': 8, 'walking': 9, 'forest': 10, 'one': 11, 'day': 12, 'humming': 13, 'proudly': 14, 'made': 15, 'little': 16, 'hum': 17, 'morning': 18, 'stoutness': 19, 'exercises': 20, 'front': 21, 'glass': 22, 'tra-la-la_': 23, 'stretched': 24, 'high': 25, 'could': 26, 'tra-la': 27, 'help': 28, 'la_': 29, 'tried': 30, 'reach': 31, 'toes': 32}

  

print(tokenizer.texts_to_sequences(preprocessed_sentences))

#[[3, 1], [1, 1, 2, 2]]
KWON • Matthew 10:8 [···] Without cost you have received; without cost you are to give.
← BACK

NLP(5) Integer Encoding

Posting information

Posting detail

Links to this note

03. NLP