In [150]:
import pandas as pd
import string
import nltk  # natural language processing toolkit
from nltk.corpus import stopwords
import gensim  # tokenization utilities for NLP
from gensim.utils import simple_preprocess
import matplotlib.pyplot as plt
import seaborn as sns
In [135]:
echo_df = pd.read_csv('Echodot2_Reviews.csv', encoding='utf-8')
echo_df.head(2)
Out[135]:
| | Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
| 1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
In [136]:
echo_df.dropna(subset=['Review Text'], how='any', axis = 0, inplace=True)
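As a quick sanity check (not part of the original notebook), the count of remaining missing review texts should now be zero:

echo_df['Review Text'].isnull().sum()   # expected: 0 after the dropna above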
In [137]:
# A function to remove punctuations
def remove_punc(text):
    Test_removed_punc = [char for char in text if char not in string.punctuation]
    Test_punc_join = ''.join(Test_removed_punc)
    return Test_punc_join
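A minimal check on a made-up string (illustrative only, not from the dataset): every character listed in string.punctuation gets stripped.

remove_punc("Great speaker, isn't it?!")   # -> 'Great speaker isnt it'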
In [138]:
# download stopwords - words that carry little meaning on their own
# and are filtered out before sentiment analysis
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Out[138]:
True
In [139]:
# Add domain-specific stopwords with extend
stop_words = stopwords.words('english')
stop_words.extend(['amazon', 'Amazon', 'echo', 'device', 'Dot', 'dot'])
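Note that gensim's simple_preprocess (used below) lowercases tokens by default, so only the lowercase entries are actually matched; the capitalized duplicates are harmless but redundant. A quick illustration on a made-up phrase:

gensim.utils.simple_preprocess('My Amazon Echo Dot')   # -> ['my', 'amazon', 'echo', 'dot']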
In [140]:
# Remove stopwords and drop short tokens (fewer than 2 characters)
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) >= 2:
            result.append(token)
    return result
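For example, on a made-up sentence the built-in English stopwords and the custom ones added above are both filtered out, leaving only the content words:

preprocess('I love my Amazon Echo Dot device')   # -> ['love']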
In [141]:
echo_df['reviews_unpunc'] = echo_df['Review Text'].apply(remove_punc)
echo_df['reviews_unpunc_unstopwords'] = echo_df['reviews_unpunc'].apply(preprocess)
In [142]:
# join the words into a string
echo_df['Adj_text'] = echo_df['reviews_unpunc_unstopwords'].apply(lambda x: ' '.join(x))
Tokenization
In [143]:
# tokenization
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()  # create a CountVectorizer instance
echo_CVec = vectorizer.fit_transform(echo_df['Adj_text'])
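echo_CVec is a sparse document-term matrix; its shape and the learned vocabulary can be inspected as below (get_feature_names_out requires scikit-learn 1.0+, older versions expose get_feature_names instead):

print(echo_CVec.shape)                           # (number of reviews, vocabulary size)
print(vectorizer.get_feature_names_out()[:10])   # first 10 tokens in the vocabulary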
Visualization
In [144]:
# the number of words
echo_df['length'] = echo_df['reviews_unpunc_unstopwords'].apply(len)
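A quick numeric summary of the token counts before plotting (a small addition, not in the original notebook):

echo_df['length'].describe()   # count, mean, and min/max number of tokens per review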
In [145]:
# plot the histogram of the length column
echo_df['length'].hist(bins=100, figsize = (20,10) )
Out[145]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe5b6fb160>
In [146]:
# label ratings of 4 or 5 as positive (1), everything else as negative (0)
def feedback_class(row):
    if '4' in str(row) or '5' in str(row):
        return 1
    else:
        return 0
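Since Rating appears to hold the star values 1-5, an equivalent vectorized one-liner (a sketch, assuming the column is numeric) would be:

(echo_df['Rating'] >= 4).astype(int)   # produces the same 0/1 labels as feedback_class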
In [147]:
echo_df['feedback'] = echo_df['Rating'].apply(feedback_class)
echo_df.head()
Out[147]:
| | Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | reviews_unpunc | reviews_unpunc_unstopwords | Adj_text | length | feedback |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Not great speakers | [great, speakers] | great speakers | 2 | 0 |
| 1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great little gagit | [great, little, gagit] | great little gagit | 3 | 1 |
| 2 | 5 | 9/8/2017 | Echo Dot | Awesome 👏🏽 | White | Awesome! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Awesome 👏🏽 | [awesome] | awesome | 1 | 1 |
| 3 | 5 | 10/19/2017 | Echo Dot | Love my Echo | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Love my Echo | [love] | love | 1 | 1 |
| 4 | 5 | 9/17/2017 | Echo Dot | Great device | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great device | [great] | great | 1 | 1 |
In [148]:
sns.countplot(x='feedback', data = echo_df)
Out[148]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe570a65b0>
In [149]:
plt.figure(figsize=(25, 10))
sns.barplot(x='Configuration Text', y='Rating', data=echo_df, palette='deep')
plt.xticks(rotation=45)
Out[149]:
(array([0, 1]), <a list of 2 Text major ticklabel objects>)
In [151]:
sns.countplot(x='Rating', data= echo_df)
Out[151]:
<matplotlib.axes._subplots.AxesSubplot at 0x7efe59e8d4c0>