728x90
In [61]:
import pandas as pd
import nltk # natural language processing (자연어 처리)
from nltk.corpus import stopwords
import gensim # 자연어 처리
from gensim.utils import simple_preprocess
In [62]:
# Load the Echo Dot review export from CSV, then preview the first two rows.
echo_df = pd.read_csv('Echodot2_Reviews.csv', encoding='utf-8')
echo_df.head(2)
Out[62]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
In [63]:
# Drop rows whose 'Review Text' is missing so the text-processing cells only
# ever receive strings. Reassigning instead of using inplace=True keeps the
# cell idempotent on re-run; how='any' / axis=0 are the dropna defaults.
echo_df = echo_df.dropna(subset=['Review Text'])
In [64]:
# Select the reviews whose text contains 'love', ignoring letter case
mask = (
    echo_df['Review Text']
    .str.lower()
    .str.contains('love')
)
echo_df.loc[mask]
Out[64]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | |
---|---|---|---|---|---|---|---|---|---|---|
3 | 5 | 10/19/2017 | Echo Dot | Love my Echo | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
11 | 5 | 10/9/2017 | Echo Dot | Alexa...You rock!! OMG people. I am not a tech... | Black | This is the greatest thing since chocolate | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
13 | 5 | 9/20/2017 | Echo Dot | I love using Alexa with the smart outlets for ... | White | Love it! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
17 | 5 | 10/8/2017 | Echo Dot | Cant say enough !!I love my DOT!!!! | Black | Happy with her Dot | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
20 | 5 | 9/8/2017 | Echo Dot | Love the echo! | Black | This is the second one for the house. | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6845 | 5 | 9/13/2017 | Echo Dot | We now have 4 Dots & one Show in the house. Pe... | White | Perfect for everyone in our family | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
6846 | 5 | 9/4/2017 | Echo Dot | Alexa is exceptional, I am getting use to ever... | Black | From what I know so far I love it. | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
6849 | 5 | 9/14/2017 | Echo Dot | Love it. | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
6850 | 5 | 9/17/2017 | Echo Dot | This is so much fun! I love her. | Black | In love with Alexa!! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
6853 | 5 | 9/27/2017 | Echo Dot | I have now set Alexa up to control lights in m... | Black | Simply fabulous! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... |
1963 rows × 10 columns
In [65]:
import string
# Display the ASCII punctuation characters that remove_punc filters out.
string.punctuation
Out[65]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
In [66]:
def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters listed in ``string.punctuation`` are deleted; all other
    characters (letters, digits, whitespace, emoji, ...) are kept in their
    original order.

    Args:
        text: The string to clean.

    Returns:
        A new string containing no ``string.punctuation`` characters.
    """
    # str.translate deletes the characters in a single pass instead of
    # building an intermediate list of characters and joining it back.
    return text.translate(str.maketrans('', '', string.punctuation))
In [67]:
# Store a punctuation-free copy of every review in a new column
echo_df['reviews_unpunc'] = echo_df['Review Text'].map(remove_punc)
In [68]:
# Download the NLTK stopwords corpus - used for the sentiment analysis
# Stopwords are words that carry no meaning on their own
nltk.download('stopwords')
stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
Out[68]:
['i',
'me',
'my',
'myself',
'we',
'our',
'ours',
'ourselves',
'you',
"you're",
"you've",
"you'll",
"you'd",
'your',
'yours',
'yourself',
'yourselves',
'he',
'him',
'his',
'himself',
'she',
"she's",
'her',
'hers',
'herself',
'it',
"it's",
'its',
'itself',
'they',
'them',
'their',
'theirs',
'themselves',
'what',
'which',
'who',
'whom',
'this',
'that',
"that'll",
'these',
'those',
'am',
'is',
'are',
'was',
'were',
'be',
'been',
'being',
'have',
'has',
'had',
'having',
'do',
'does',
'did',
'doing',
'a',
'an',
'the',
'and',
'but',
'if',
'or',
'because',
'as',
'until',
'while',
'of',
'at',
'by',
'for',
'with',
'about',
'against',
'between',
'into',
'through',
'during',
'before',
'after',
'above',
'below',
'to',
'from',
'up',
'down',
'in',
'out',
'on',
'off',
'over',
'under',
'again',
'further',
'then',
'once',
'here',
'there',
'when',
'where',
'why',
'how',
'all',
'any',
'both',
'each',
'few',
'more',
'most',
'other',
'some',
'such',
'no',
'nor',
'not',
'only',
'own',
'same',
'so',
'than',
'too',
'very',
's',
't',
'can',
'will',
'just',
'don',
"don't",
'should',
"should've",
'now',
'd',
'll',
'm',
'o',
're',
've',
'y',
'ain',
'aren',
"aren't",
'couldn',
"couldn't",
'didn',
"didn't",
'doesn',
"doesn't",
'hadn',
"hadn't",
'hasn',
"hasn't",
'haven',
"haven't",
'isn',
"isn't",
'ma',
'mightn',
"mightn't",
'mustn',
"mustn't",
'needn',
"needn't",
'shan',
"shan't",
'shouldn',
"shouldn't",
'wasn',
"wasn't",
'weren',
"weren't",
'won',
"won't",
'wouldn',
"wouldn't"]
In [69]:
# Stopword list: NLTK's English stopwords followed by product-specific terms
stop_words = [
    *stopwords.words('english'),
    'amazon', 'Amazon', 'alexa', 'echo', 'Alexa', 'device', 'Dot', 'dot',
]
In [70]:
# Demo: simple_preprocess turns a string into a list of lower-cased tokens
simple_preprocess(echo_df.loc[0, 'reviews_unpunc'])
Out[70]:
['not', 'great', 'speakers']
In [72]:
# Remove stopwords and short words (fewer than 2 characters)
def preprocess(text):
    """Tokenize ``text`` and drop stopwords and one-character tokens.

    Args:
        text: The review string to tokenize.

    Returns:
        list: Tokens produced by ``gensim.utils.simple_preprocess`` that are
        not in the module-level ``stop_words`` list and are at least two
        characters long, in their original order.
    """
    # The previous check was ``len(token) >= 1``, which is always true and
    # contradicted the stated intent of dropping words shorter than 2 chars.
    # NOTE(review): simple_preprocess's documented default min_len is 2, so
    # this should not change current output - confirm for the gensim version
    # in use.
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if token not in stop_words and len(token) >= 2
    ]
In [73]:
# Tokenize each punctuation-free review and filter its tokens via preprocess
echo_df['reviews_unpunc_unstopwords'] = echo_df['reviews_unpunc'].map(preprocess)
In [76]:
# Inspect the punctuation-free text of the row labelled 500
echo_df.loc[500, 'reviews_unpunc']
Out[76]:
'I love the echo dot and this is my second purchase They are so handy and cute we decided to buy one for the bedroom The sound is good but we often listen to music and decided to buy a small 30 Doss Soundbox xs speaker also at Amazon and it has an incredible sound and range I love this gadget and would recommend it to anyone'
In [78]:
# Inspect the filtered token list of the row labelled 500
echo_df.loc[500, 'reviews_unpunc_unstopwords']
Out[78]:
['love',
'second',
'purchase',
'handy',
'cute',
'decided',
'buy',
'one',
'bedroom',
'sound',
'good',
'often',
'listen',
'music',
'decided',
'buy',
'small',
'doss',
'soundbox',
'xs',
'speaker',
'also',
'incredible',
'sound',
'range',
'love',
'gadget',
'would',
'recommend',
'anyone']
In [80]:
# Re-assemble each token list into one space-separated string
echo_df['Adj_text'] = echo_df['reviews_unpunc_unstopwords'].map(' '.join)
In [82]:
# Preview the first five rows, including the newly derived text columns
echo_df.head(n=5)
Out[82]:
Rating | Review Date | Configuration Text | Review Text | Review Color | Title | User Verified | Review Useful Count | Declaration Text | Pageurl | reviews_unpunc | reviews_unpunc_unstopwords | Adj_text | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 10/3/2017 | Echo Dot | Not great speakers | Black | Three Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Not great speakers | [great, speakers] | great speakers |
1 | 4 | 9/26/2017 | Echo Dot | Great little gagit | White | Four Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great little gagit | [great, little, gagit] | great little gagit |
2 | 5 | 9/8/2017 | Echo Dot | Awesome 👏🏽 | White | Awesome! | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Awesome 👏🏽 | [awesome] | awesome |
3 | 5 | 10/19/2017 | Echo Dot | Love my Echo | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Love my Echo | [love] | love |
4 | 5 | 9/17/2017 | Echo Dot | Great device | Black | Five Stars | Verified Purchase | NaN | NaN | https://www.amazon.com/All-New-Amazon-Echo-Dot... | Great device | [great] | great |
728x90
댓글