728x90
In [21]:
import pandas as pd
import string
import nltk # natural language processing (자연어 처리)
from nltk.corpus import stopwords
import gensim # 자연어 처리: 토큰
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
In [22]:
# Load the Echo Dot (2nd gen) Amazon review dataset and preview the first rows.
# Relies on 'Echodot2_Reviews.csv' sitting next to the notebook.
echo_df = pd.read_csv('Echodot2_Reviews.csv', encoding='utf-8')
echo_df.head(2)
Out[22]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
In [23]:
echo_df.dropna(subset=['Review Text'], how='any', axis = 0, inplace=True)
In [24]:
# Translation table mapping every ASCII punctuation character to None,
# built once at definition time instead of scanning string.punctuation
# for every character of every review.
_PUNC_TABLE = str.maketrans('', '', string.punctuation)

def remove_punc(text):
    """Return `text` with all ASCII punctuation removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The input with every character in ``string.punctuation`` stripped;
        all other characters (letters, digits, whitespace, non-ASCII such
        as emoji) are preserved in order.
    """
    # str.translate does the filtering in C — same result as the original
    # per-character list comprehension + ''.join, but O(len(text)).
    return text.translate(_PUNC_TABLE)
In [25]:
# Download the NLTK stopword corpus (used for sentiment/text analysis).
# Stopwords are words that carry no meaning on their own ("the", "a", ...).
# Idempotent: NLTK skips the download if the corpus is already present.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Out[25]:
True
In [26]:
# Add domain-specific stopwords (product/brand names) on top of NLTK's
# English list, via list.extend.
# NOTE(review): the tokenized output further down shows that
# gensim.utils.simple_preprocess lowercases tokens ("Great" -> "great")
# before this list is consulted, so the capitalized entries 'Amazon',
# 'Echo', 'Dot' can never match — only the lowercase variants take effect.
stop_words = stopwords.words('english')
stop_words.extend(['amazon','Amazon','Echo','echo','device','Dot','dot'])
In [27]:
# Remove stopwords and short words (fewer than 2 characters)
def preprocess(text):
    """Tokenize `text` and drop stopwords and very short tokens.

    gensim.utils.simple_preprocess lowercases the text and splits it into
    tokens, already discarding tokens shorter than 2 characters (its
    ``min_len`` default). The explicit length guard below states that
    intent; the original ``len(token) >= 1`` check was always true and
    filtered nothing.

    Parameters
    ----------
    text : str
        Punctuation-free review text.

    Returns
    -------
    list of str
        Lowercased tokens with stopwords (module-level ``stop_words``)
        and sub-2-character tokens removed.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) >= 2
    ]
In [28]:
# Apply the two cleaning steps as separate columns so each intermediate
# stage stays inspectable: strip punctuation first, then tokenize and
# drop stopwords on the punctuation-free text.
echo_df['reviews_unpunc']= echo_df['Review Text'].apply(remove_punc)
echo_df['reviews_unpunc_unstopwords'] = echo_df['reviews_unpunc'].apply(preprocess)
In [29]:
# Rebuild a plain-text string from each review's token list.
# Series.str.join(' ') concatenates the list elements with single spaces —
# the same result as apply(lambda x: ' '.join(x)), without the Python lambda.
echo_df['Adj_text'] = echo_df['reviews_unpunc_unstopwords'].str.join(' ')
In [30]:
echo_df.head()
Out[30]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | reviews_unpunc | reviews_unpunc_unstopwords | Adj_text | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Not great speakers | [great, speakers] | great speakers |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great little gagit | [great, little, gagit] | great little gagit |
2 | 5 | 9/8/2017 | Echo Dot | Awesome 👏🏽 | White | Awesome! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Awesome 👏🏽 | [awesome] | awesome |
3 | 5 | 10/19/2017 | Echo Dot | Love my Echo | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Love my Echo | [love] | love |
4 | 5 | 9/17/2017 | Echo Dot | Great device | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great device | [great] | great |
## Tokenization
In [31]:
# tokenization
# Bag-of-words encoding: CountVectorizer learns a vocabulary over all
# cleaned reviews and counts term occurrences per review, producing a
# sparse (n_reviews, n_vocabulary) document-term matrix.
# NOTE(review): this import sits mid-notebook; conventionally it belongs
# in the top import cell.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer() # instance
echo_CVec = vectorizer.fit_transform(echo_df['Adj_text'])
In [32]:
echo_CVec.shape
Out[32]:
(6852, 7632)
In [33]:
print(vectorizer.get_feature_names()[:50])
['aaa', 'aaaaaaaaaa', 'aardvarks', 'abba', 'abgn', 'abilities', 'abilitiesalexa', 'abilitiesi', 'ability', 'able', 'aboutbut', 'aboutmay', 'absolute', 'absolutely', 'absurd', 'abummer', 'ac', 'academy', 'accent', 'accents', 'accept', 'acceptable', 'accesable', 'access', 'accessary', 'accessed', 'accesses', 'accessibility', 'accessible', 'accessing', 'accessories', 'accessory', 'accidently', 'accommodating', 'accompany', 'accompanying', 'accomplish', 'accomplished', 'accomplishing', 'according', 'account', 'accountfirst', 'accounti', 'accounts', 'accountsnot', 'acct', 'accuracy', 'accurate', 'accurately', 'accustomed']
/usr/local/lib/python3.8/dist-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
In [34]:
print(echo_CVec.toarray())
[[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
In [35]:
# Keep a dense copy of the count matrix for per-review inspection and
# plotting below; shape should match the sparse matrix: (6852, 7632).
word_count_array = echo_CVec.toarray()
word_count_array.shape
Out[35]:
(6852, 7632)
In [36]:
# the first sample
# Word-count vector of review 0 — mostly zeros, as expected for a short
# review measured against a ~7.6k-word vocabulary.
word_count_array[0,:]
Out[36]:
array([0, 0, 0, ..., 0, 0, 0])
In [37]:
# Visualize which vocabulary indices occur in a single review.
# Title and axis labels added so the figure stands alone when skimmed;
# plt.show() also suppresses the stray [<matplotlib.lines.Line2D ...>]
# repr the bare plot call left in the cell output.
index = 500
plt.plot(word_count_array[index,:])
plt.title('Word counts for review #{}'.format(index))
plt.xlabel('Vocabulary index')
plt.ylabel('Count')
plt.show()
Out[37]:
[<matplotlib.lines.Line2D at 0x7fde3373e1c0>]
In [38]:
echo_df['Adj_text'][index]
Out[38]:
'love second purchase handy cute decided buy one bedroom sound good often listen music decided buy small doss soundbox xs speaker also incredible sound range love gadget would recommend anyone'
728x90
'Data Analytics with python > [Data Analysis]' 카테고리의 다른 글
[Text]S8_08_Word_Cloud (0) | 2023.01.21 |
---|---|
[Text]S8_07_Text_visualization (0) | 2023.01.21 |
[Text]S8_04_Text_cleaning(removing_punctuation) (0) | 2023.01.21 |
[Text]S8_03_Text_in_pandas_2 (0) | 2023.01.21 |
[Text]S8_02_Text_in_pandas_1 (0) | 2023.01.21 |
댓글