728x90
In [3]:
import pandas as pd
import string
import nltk # natural language processing (자연어 처리)
from nltk.corpus import stopwords
import gensim # 자연어 처리 중 토큰화
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
import seaborn as sns
In [4]:
# Load the Echo Dot review data set; UTF-8 decoding handles the emoji and
# other non-ASCII characters that appear in some review texts.
echo_df = pd.read_csv('Echodot2_Reviews.csv', encoding='utf-8')
echo_df.head(2)
Out[4]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
In [5]:
echo_df.dropna(subset=['Review Text'], how='any', axis = 0, inplace=True)
In [6]:
# A function to remove punctuation characters from a piece of text.
def remove_punc(text):
    """Return `text` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The input with all characters in ``string.punctuation`` stripped;
        all other characters (letters, digits, whitespace, emoji) kept.
    """
    # str.translate is the idiomatic — and much faster — equivalent of
    # filtering the string character by character and re-joining it.
    return text.translate(str.maketrans('', '', string.punctuation))
In [7]:
# Download the NLTK stopword corpus — words that carry no meaning on their
# own (e.g. "the", "a") — used below when cleaning text for sentiment analysis.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
Out[7]:
True
In [8]:
# Add domain-specific stopwords on top of NLTK's English list (extend
# mutates the list in place).
# NOTE(review): the capitalised entries ('Amazon', 'Echo', 'Dot') appear
# redundant — gensim's simple_preprocess lowercases tokens before the
# stopword check below — but they are harmless.
stop_words = stopwords.words('english')
stop_words.extend(['amazon','Amazon','Echo','echo','device','Dot','dot'])
In [9]:
# Remove stopwords and very short words (fewer than 2 characters).
def preprocess(text):
    """Tokenize `text` and keep only meaningful tokens.

    Parameters
    ----------
    text : str
        Punctuation-free review text.

    Returns
    -------
    list of str
        Lowercased tokens with stopwords and 1-character tokens removed.

    Notes
    -----
    The original guard ``len(token) >= 1`` was always true and therefore
    filtered nothing; ``len(token) > 1`` matches the stated intent.  The
    results are unchanged because gensim's ``simple_preprocess`` already
    enforces ``min_len=2`` by default.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) > 1
    ]
In [10]:
# Clean the review text in two stages: strip punctuation, then tokenize
# and drop stopwords/short tokens.
echo_df['reviews_unpunc']= echo_df['Review Text'].apply(remove_punc)
echo_df['reviews_unpunc_unstopwords'] = echo_df['reviews_unpunc'].apply(preprocess)
In [11]:
# Collapse each token list back into a single whitespace-separated string;
# str.join can be passed to apply directly, no lambda wrapper needed.
echo_df['Adj_text'] = echo_df['reviews_unpunc_unstopwords'].apply(' '.join)
In [12]:
# feedback column: binary sentiment label derived from the star rating.
def feedback_class(row):
    """Map a star rating to a binary feedback label.

    Parameters
    ----------
    row : int, float or str
        A value from the 'Rating' column (expected 1-5).

    Returns
    -------
    int
        1 for a positive review (rating >= 4), 0 otherwise.

    Notes
    -----
    The original substring test (``'4' in str(row)``) would mislabel
    values such as 14 or '3.45'; a numeric comparison is robust.
    """
    try:
        return 1 if float(row) >= 4 else 0
    except (TypeError, ValueError):
        # Non-numeric / missing ratings count as not-positive.
        return 0
In [13]:
# Derive the binary sentiment label for every review, then inspect the
# enriched frame.
echo_df['feedback'] = echo_df['Rating'].apply(feedback_class)
echo_df.head()
Out[13]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | reviews_unpunc | reviews_unpunc_unstopwords | Adj_text | feedback | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Not great speakers | [great, speakers] | great speakers | 0 |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great little gagit | [great, little, gagit] | great little gagit | 1 |
2 | 5 | 9/8/2017 | Echo Dot | Awesome 👏🏽 | White | Awesome! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Awesome 👏🏽 | [awesome] | awesome | 1 |
3 | 5 | 10/19/2017 | Echo Dot | Love my Echo | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Love my Echo | [love] | love | 1 |
4 | 5 | 9/17/2017 | Echo Dot | Great device | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great device | [great] | great | 1 |
Word cloud visualization
In [14]:
# Collect the cleaned review strings into a plain Python list; peek at
# the first five.
sentences = echo_df['Adj_text'].tolist()
sentences[0:5]
Out[14]:
['great speakers', 'great little gagit', 'awesome', 'love', 'great']
In [15]:
len(sentences)
Out[15]:
6852
In [16]:
# All sentences collected into a single list
# print(sentences)
In [17]:
# Join all elements in the list into one massive string — the word cloud
# generator expects a single text blob.
one_string = ' '.join(sentences)
# one_string
In [18]:
# Word cloud of the most frequently mentioned words across all reviews.
# NOTE(review): importing mid-notebook works, but imports are conventionally
# grouped in the first code cell so dependencies are visible up front.
from wordcloud import WordCloud
plt.figure(figsize=(20,10))
plt.imshow(WordCloud().generate(one_string))
Out[18]:
<matplotlib.image.AxesImage at 0x7f79a6a1d4c0>
In [19]:
# Positive reviews: 5-star ratings only; .shape shows how many there are.
positive_df = echo_df[echo_df['Rating']==5]
positive_df.shape
Out[19]:
(4382, 14)
In [20]:
# Word cloud built from the positive (5-star) reviews only.
sentences = positive_df['Adj_text'].tolist()
one_string = ' '.join(sentences)
plt.figure(figsize=(20,10))
plt.imshow(WordCloud().generate(one_string))
Out[20]:
<matplotlib.image.AxesImage at 0x7f79a6dcb520>
In [21]:
# Word cloud built from the negative (1-star) reviews only.
negative_df = echo_df[echo_df['Rating'] == 1]
# (A stray `negative_df.shape` expression was removed here: mid-cell its
# value was silently discarded — dead code, unlike the displayed shape in
# the positive-review cell.)
sentences = negative_df['Adj_text'].tolist()
one_string = ' '.join(sentences)
plt.figure(figsize=(20, 10))
plt.imshow(WordCloud().generate(one_string))
Out[21]:
<matplotlib.image.AxesImage at 0x7f79a6e00a00>
728x90
'Data Analytics with python > [Data Analysis]' 카테고리의 다른 글
[Visualization] matplotlib (0) | 2023.01.22 |
---|---|
[Visualization] Basic_for _visualization (0) | 2023.01.22 |
[Text]S8_07_Text_visualization (0) | 2023.01.21 |
[Text]S8_06_Text_tokenization (0) | 2023.01.21 |
[Text]S8_04_Text_cleaning(removing_punctuation) (0) | 2023.01.21 |
댓글