When you search the App Store for an interesting game and read through the long reviews, you are probably wondering which games are popular and which are outdated. Twitter is a good resource for checking a game's popularity. After determining popularity, one would like keywords or labels for the favorite games in order to better understand their content. This analysis helps users choose a game app.
Scraping Data from Twitter
First, I scraped the iTunes App Store to get a list of game app names, then used twitterscraper to search for those names as keywords and store the matching tweets (twitterscraper doesn’t need authentication and avoids being blocked by Twitter).
import dill
import twitterscraper

# Load the stored list of app names.
# NOTE: dill, like pickle, can execute arbitrary code while loading; only load
# files you created yourself.
with open('data_stored/app_action_names', 'rb') as names_file:
    app_name = dill.load(names_file)

# Keep only names longer than 7 characters and leave out 'Fortnite'.
# Filtering (instead of list.remove) drops every occurrence and does not raise
# when the name is absent.
app_name = [a for a in app_name if len(a) > 7 and a != 'Fortnite']
import datetime as dt

from twitterscraper import query_tweets


def get_tweet(app):
    """
    Returns the result of ``query_tweets`` for `app` over the last seven days.

    The query is unlimited (``limit=None``), restricted to ``lang='en'`` and
    run with a pool size of 20.

    :param app:
        The search query passed to ``query_tweets`` (here, a game name).
    """
    # Read the clock once so both ends of the window derive from the same day.
    today = dt.date.today()
    return query_tweets(
        app,
        limit=None,
        begindate=today - dt.timedelta(days=7),
        enddate=today,
        poolsize=20,
        lang='en',
    )
Then I used spaCy to clean the data, dropping URL-like tokens from each tweet.
import spacy
from spacy.lang.en import English

# Only tokenization and the `like_url` token flag are used below, so the
# plain English language class is sufficient; no trained model is loaded.
parser = English()


def tokenize(text):
    """
    Returns the spaCy tokens of `text`, leaving out tokens that look like URLs.

    :param text:
        The raw string to tokenize.
    """
    return [token for token in parser(text) if not token.like_url]
Store the data with dill for future reuse.
import dill

# Persist `data_array` so later steps can reload it instead of rebuilding it.
# The context manager guarantees the file is flushed and closed.
with open('data_stored/data_104.pkl', 'wb') as out_file:
    dill.dump(data_array, out_file)
After data wrangling, we can easily compute the statistics. Here is the information for the top 10 games over one week.
Scraping iTunes Reviews
# NOTE: this section relies on `time`, `typing` and `requests` having been
# imported earlier in the session.


def is_error_response(http_response, seconds_to_sleep: float = 1) -> bool:
    """
    Returns True when the response should be treated as a failure.

    A 503 response is not reported as an error: the function sleeps for
    ``seconds_to_sleep`` and returns False. Any other status counts as an
    error unless it is 200.

    :param http_response:
        The response object to inspect; only ``status_code`` is read.
    :param seconds_to_sleep:
        How long to pause, in seconds, after a 503 response.
    """
    if http_response.status_code == 503:
        time.sleep(seconds_to_sleep)
        return False

    return http_response.status_code != 200


def get_json(url) -> typing.Union[dict, None]:
    """
    Returns json response if any. Returns None if no json found.

    :param url:
        The url to get the json from.
    """
    response = requests.get(url)
    if is_error_response(response):
        return None

    try:
        return response.json()
    except ValueError:
        # The body is not valid JSON. This can also happen for a 503, which
        # is_error_response lets through. Report it as "no json" so callers
        # can stop cleanly instead of crashing.
        return None


def _parse_entry(entry: dict) -> dict:
    """Flattens one feed entry into a plain review dictionary."""
    return {
        'review_id': entry.get('id').get('label'),
        'title': entry.get('title').get('label'),
        'author': entry.get('author').get('name').get('label'),
        'author_url': entry.get('author').get('uri').get('label'),
        'version': entry.get('im:version').get('label'),
        'rating': entry.get('im:rating').get('label'),
        'review': entry.get('content').get('label'),
        'vote_count': entry.get('im:voteCount').get('label')
    }


def get_reviews(app_id, page=1) -> typing.List[dict]:
    """
    Returns a list of dictionaries with each dictionary being one review.

    Pages are fetched one after another, starting at `page`, until
    ``get_json`` returns nothing (error status or invalid json). A page
    without entries contributes no reviews and the crawl moves on to the
    next page.

    :param app_id:
        The app_id you are searching.
    :param page:
        The page id to start the loop. Once it reaches the final page + 1, the
        app will return a non valid json, thus it will exit with the current
        reviews.
    """
    reviews: typing.List[dict] = []

    while True:
        url = (f'https://itunes.apple.com/rss/customerreviews/id={app_id}/'
               f'page={page}/sortby=mostrecent/json')
        payload = get_json(url)
        if not payload:
            return reviews

        entries = (payload.get('feed') or {}).get('entry')
        # NOTE(review): a page holding a single entry may deliver it as a bare
        # object rather than a list - confirm against the feed; wrapping it
        # keeps the loop below uniform either way.
        if isinstance(entries, dict):
            entries = [entries]

        # Entries carrying 'im:name' are skipped (presumably the app's own
        # metadata record rather than a customer review - confirm).
        reviews += [
            _parse_entry(entry)
            for entry in entries or []
            if not entry.get('im:name')
        ]
        page += 1
I looked up the previous top games in the iTunes Store and stored their reviews.
import dill

# iTunes Store ids of the games whose reviews are collected.
id_list = ['922558758', '1053012308', '905408749', '603527166', '1330123889', '871809581',
           '529479190', '535609266', '847492141', '964436963', '1252963998', '1383187127',
           '1109008423', '572395608', '933989137', '1336527043', '1236104279', '423593206',
           '1307961750', '958763157']

app_review = []
for app_id in id_list:
    # One document per app: all of its review bodies joined by single spaces.
    # Falsy items (e.g. an empty placeholder dict) are skipped.
    review_texts = [
        review['review']
        for review in get_reviews(int(app_id))
        if review
    ]
    app_review.append(' '.join(review_texts))

# NOTE: the file name spelling ('reivew') is kept as-is so anything that
# already reads this path keeps working.
with open('data_stored/app_reivew.pkl', 'wb') as out_file:
    dill.dump(app_review, out_file)
Next, I extract the most frequent words to use as labels.
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English

# NOTE(review): the 'en' shortcut was removed in spaCy 3, where a model is
# loaded by its package name (e.g. 'en_core_web_sm') - confirm against the
# installed version.
nlp = spacy.load('en')

# Lemmas excluded from the labels; built once instead of once per token.
EXCLUDED_LEMMAS = frozenset([
    'play', 'player', 'update', 'upgrade',
    'please', 'people', 'account', 'thing',
])


def preprocess(doc):
    """
    Returns the lower-cased lemmas of the label-worthy nouns in `doc`.

    A token is kept when it is a noun longer than 5 characters that is not
    punctuation, not URL-like, not a stop word, and whose lemma is not in
    ``EXCLUDED_LEMMAS``.

    :param doc:
        The raw text to analyse.
    """
    result = []
    for token in nlp(doc):
        if token.is_punct or len(token) <= 5 or token.like_url or token.is_stop:
            continue
        if token.pos_ != 'NOUN':
            continue
        lemma = token.lemma_.lower()
        if lemma not in EXCLUDED_LEMMAS:
            result.append(lemma)
    return result
review_df = pd.DataFrame({'app_review': app_review})
processed_docs = review_df['app_review'].map(preprocess)
dictionary = gensim.corpora.Dictionary(processed_docs)
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=10000)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Number of most frequent terms kept as labels for each app.
TOP_WORDS_PER_APP = 5

word_list = []
for bow_doc in bow_corpus:
    # Each bag-of-words item is a (token_id, count) pair; sort by count,
    # highest first. Slicing (rather than indexing 0..4) copes with documents
    # that have fewer than TOP_WORDS_PER_APP distinct terms.
    top_terms = sorted(bow_doc, key=lambda pair: -pair[1])[:TOP_WORDS_PER_APP]
    word_list.append([dictionary[token_id] for token_id, _count in top_terms])
The keywords help users get an impression of each game. I also included a link to a website that analyzes fake reviews.
We can see that DomiNations involves ‘crown’, ‘attach’, ‘troop’, and ‘alliance’, which clearly describe the game. Another example is Smash Hit: one of its words is ‘addictive’, which immediately makes me want to try the game.