#general purpose packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Customize filename
# NOTE(review): this is an absolute, machine-specific path — point it at your
# local copy of omicron.csv before running.
filename = "/Users/michiganboy/Documents/course1/assets/omicron.csv"
# Load the CSV into the DataFrame used by every cell below.
omicron = pd.read_csv(filename)


omicron


omicron.describe()


omicron.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39168 entries, 0 to 39167
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                39168 non-null  int64 
 1   user_name         39168 non-null  object
 2   user_location     30289 non-null  object
 3   user_description  36808 non-null  object
 4   user_created      39168 non-null  object
 5   user_followers    39168 non-null  int64 
 6   user_friends      39168 non-null  int64 
 7   user_favourites   39168 non-null  int64 
 8   user_verified     39168 non-null  bool  
 9   date              39168 non-null  object
 10  text              39168 non-null  object
 11  hashtags          28915 non-null  object
 12  source            39168 non-null  object
 13  retweets          39168 non-null  int64 
 14  favorites         39168 non-null  int64 
 15  is_retweet        39168 non-null  bool  
dtypes: bool(2), int64(6), object(8)
memory usage: 4.3+ MB


# Number of tweets per day: parse the 'date' column, reduce each timestamp to an
# 'MM-DD' label, count the labels and sort by label.
# rename_axis('index') pins the label column name to 'index'. Without it,
# pandas >= 2.0 names that column after the source column ('date'), while older
# versions call it 'index' — and the bar plot reads x='index'.
tweets_per_day = (pd.to_datetime(omicron['date'])
                  .dt.strftime('%m-%d')
                  .value_counts()
                  .sort_index()
                  .rename_axis('index')
                  .reset_index(name='counts')
                 )


tweets_per_day


### BEGIN SOLUTION

# Bar chart of the daily tweet counts, one bar per 'MM-DD' label.
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=tweets_per_day, x='index', y='counts',
            edgecolor='black', ci=False, palette='Blues_r', ax=ax)
ax.set_title('Tweets count by date')
ax.set_yticks([])  # no y ticks: every bar carries its own count label instead
ax.bar_label(ax.containers[0])
ax.set_ylabel('count')
ax.set_xlabel('')

### END SOLUTION

Text(0.5, 0, '')


# Number of tweets per hour of day (all days pooled): same recipe as the daily
# counts, but each timestamp is reduced to its two-digit hour ('00'..'23').
# rename_axis('index') pins the label column name to 'index'. Without it,
# pandas >= 2.0 names that column after the source column ('date'), while older
# versions call it 'index' — and the bar plot reads x='index'.
tweets_per_hour = (pd.to_datetime(omicron['date'])
                  .dt.strftime('%H')
                  .value_counts()
                  .sort_index()
                  .rename_axis('index')
                  .reset_index(name='counts')
                 )


tweets_per_hour


### BEGIN SOLUTION

# Bar chart of the hourly tweet counts, one bar per hour label.
fig, ax = plt.subplots(figsize=(13, 8))
sns.barplot(data=tweets_per_hour, x='index', y='counts',
            edgecolor='black', ci=False, palette='mako_r', ax=ax)
ax.set_title('Tweets count by hour')
ax.set_yticks([])  # no y ticks: every bar carries its own count label instead
ax.bar_label(ax.containers[0])
ax.set_ylabel('count')
ax.set_xlabel('')

### END SOLUTION

Text(0.5, 0, '')


##CUSTOM DEFINED FUNCTIONS TO CLEAN THE TWEETS
import re
import string
#Clean emojis from text
def strip_emoji(text):
    """Return ``text`` with every match of the emoji package's regex removed.

    NOTE(review): ``emoji`` is not imported in the visible part of this file,
    and ``emoji.get_emoji_regexp`` appears to have been dropped in emoji 2.0 —
    confirm the import and the installed version before calling this.
    """
    emoji_pattern = emoji.get_emoji_regexp()
    return re.sub(emoji_pattern, "", text)

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    """Lowercase ``text`` and strip line breaks, links, mentions and symbols.

    Steps, in order:
      1. drop ``\\r``, turn ``\\n`` into a space, lowercase;
      2. delete every ``@...`` mention and ``http(s)://...`` link up to the
         next whitespace;
      3. delete every character outside the ASCII range;
      4. delete all ``string.punctuation`` characters (this includes ``#``
         and ``_``) plus a few mojibake characters.

    Returns the cleaned string; whitespace left behind by removals is kept.
    """
    lowered = text.replace('\r', '').replace('\n', ' ').lower()
    without_refs = re.sub(r"(?:\@|https?\://)\S+", "", lowered)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_refs)
    # The extra non-ASCII entries are already gone after the previous step;
    # they are kept so the banned set matches the original definition.
    banned_list = string.punctuation + 'Ã' + '±' + 'ã' + '¼' + 'â' + '»' + '§'
    return ascii_only.translate(str.maketrans('', '', banned_list))

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and un-hash the ones inside the sentence.

    First pass: remove every ``#tag`` that is followed only by further
    hashtags and whitespace up to the end of the tweet. The literal tag
    ``#hashtag`` is excluded from that removal by the negative lookahead.
    Second pass: split on the remaining ``#`` symbols and on ``_`` and
    re-join the pieces with single spaces, so ``#word`` becomes ``word`` and
    ``snake_case`` becomes ``snake case``.

    The patterns are raw strings on purpose: in a plain string literal ``\\b``
    is a backspace character, not a regex word boundary, which silently
    disabled the ``#hashtag`` exclusion; ``\\w`` and ``\\s`` are also invalid
    string escapes there.
    """
    trailing_tags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    # Remove the hashtags that close the tweet.
    new_tweet = " ".join(word.strip() for word in re.split(trailing_tags, tweet))
    # Strip the '#' symbol (and '_') from words in the middle of the sentence.
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet))
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated word of ``a`` containing ``$`` or ``&``.

    Offending words are replaced by empty strings rather than dropped, so the
    spaces around them survive in the returned string.
    """
    kept = ['' if ('$' in token or '&' in token) else token
            for token in a.split(' ')]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    untouched. The pattern is a raw string so that ``\\s`` reaches the regex
    engine without relying on an invalid string escape.
    """
    return re.sub(r"\s\s+", " ", text)

# Run every raw tweet through the cleaning helpers, innermost call first:
# strip_all_entities -> clean_hashtags -> filter_chars -> remove_mult_spaces.
texts_new = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(raw_tweet))))
    for raw_tweet in omicron.text
]
omicron['text_clean'] = texts_new

# Word count (whitespace-separated tokens) of each cleaned tweet, stored for plotting.
text_len = [len(cleaned.split()) for cleaned in omicron.text_clean]
omicron['text_len'] = text_len


### BEGIN SOLUTION

# 1) uninformative prior

# 2) informative prior

### END SOLUTION


### BEGIN SOLUTION

# Kernel density estimate of the per-tweet word counts.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(data=omicron, x='text_len', ax=ax)
# Alternative view: sns.histplot(x='text_len', data=omicron, bins=20)
ax.set_title('Cleaned text length')

### END SOLUTION

Text(0.5, 1.0, 'Cleaned text length')


# We may need a bigger plot size to clearly show the relationship

### BEGIN SOLUTION 
# Followers vs. retweets with a fitted regression line and marginal plots.
# jointplot is a figure-level function: it always builds its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty extra figure behind.
# The (square) plot size is therefore set through `height`.
sns.jointplot(x="user_followers", y="retweets",
              kind="reg", data=omicron, scatter_kws={"s": 5},
              height=8)

### END SOLUTION

<seaborn.axisgrid.JointGrid at 0x7fce17894640>

<Figure size 936x576 with 0 Axes>


# What about changing both the user_followers and retweets into logarithmic scale? Does it look different?

# Same relationship on a log scale. log1p(x) == log(x + 1), which keeps zero
# counts finite. jointplot builds its own figure, so the size is set through
# `height` instead of a separate plt.figure call (which would stay empty).
sns.jointplot(x=np.log1p(omicron["user_followers"]), y=np.log1p(omicron["retweets"]),
              kind="reg", scatter_kws={"s": 10}, joint_kws={'line_kws': {'color': 'yellow'}},
              height=8)

<seaborn.axisgrid.JointGrid at 0x7fce1789d430>

<Figure size 936x576 with 0 Axes>


### BEGIN SOLUTION
# Annotated heatmap of the pairwise correlations.
f, ax = plt.subplots(figsize=(15, 15))
# Restrict to numeric and boolean columns explicitly: pandas >= 2.0 no longer
# drops the text columns silently and DataFrame.corr() raises on them.
# select_dtypes works the same on older and newer pandas versions.
sns.heatmap(omicron.select_dtypes(include=['number', 'bool']).corr(), annot=True,
            linewidths=.6, fmt='.1f', ax=ax)
### END SOLUTION

<AxesSubplot:>

	id	user_name	user_location	user_description	user_created	user_followers	user_friends	user_favourites	user_verified	date	text	hashtags	source	retweets	favorites	is_retweet
0	1465693385088323591	Abaris	Hants	Would appear on Blogger (did originally), but...	2009-09-16 14:30:32	2880	4369	26907	False	2021-11-30 14:45:08	@SkyNews "Told you I'd be Back!" #OMICRON “Odi...	['OMICRON']	Twitter Web App	0	0	False
1	1465693062999412746	GFTs 	Lalaland	There's a field somewhere beyond all doubt and...	2019-12-28 14:29:13	165	583	21152	False	2021-11-30 14:43:52	Someone told me this in October #Omicron https...	['Omicron']	Twitter for Android	0	0	False
2	1465690116442279942	Herbie Finkle (Cozy)	NaN	help me find me frens ❤️	2021-07-10 09:40:57	114	393	2339	False	2021-11-30 14:32:09	Glad to see the public schoolkids are wrapping...	['COVID']	Twitter Web App	0	1	False
3	1465689607165591552	Electrical Review	United Kingdom	Electrical Review is a monthly journal aimed a...	2009-05-21 08:32:19	20759	2321	739	False	2021-11-30 14:30:08	#Automation systems have become increasingly c...	['Automation']	Hootsuite Inc.	0	0	False
4	1465688203709464578	BingX Academy 🔑	NaN	BingX (Prev. Bingbon) is the world's top crypt...	2013-12-31 02:57:32	17134	8	31	False	2021-11-30 14:24:33	🟢 If u think Omicron is a FUD, you LONG📈\n🔴 If...	NaN	Twitter Web App	2	2	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
39163	1471495763880386560	Mahaan	NaN	Matured Dalapathy fan from kerala❤ Dancer of k...	2021-10-13 10:04:38	24	57	1963	False	2021-12-16 15:01:43	@aanakattil_ @arunjlak1 Anils insecurity about...	['Thala']	Twitter for Android	3	4	False
39164	1471495731005427718	Allison Bell	New York, NY	Covers life & health insurance for @TA_LifeHea...	2009-08-19 14:33:37	6980	7677	6725	False	2021-12-16 15:01:35	View>Evidence that a big #Omicron wave is s...	['Omicron']	Twitter for iPhone	0	0	False
39165	1471495729772306447	Alan	Ontario, Canada	🇨🇦	2011-10-10 19:58:08	431	665	17564	False	2021-12-16 15:01:35	More than double the cases for vaccinated than...	NaN	Twitter Web App	0	1	False
39166	1471495686105759750	Douglas MacDonald	Jeju Island, South Korea	Canadian 🇨🇦 photojournalist living in #Jeju #S...	2009-10-26 02:27:06	6136	2622	2314	False	2021-12-16 15:01:25	How does #Omicron spread so fast? Virus may no...	['Omicron']	Twitter for Android	0	0	False
39167	1471495681940492296	Garreth McDaid	Leitrim, Ireland	#Bitcoin	2009-03-23 17:02:45	1601	803	11868	False	2021-12-16 15:01:24	I can understand people wanting #omicron to be...	['omicron']	Twitter Web App	0	59	False

	id	user_followers	user_friends	user_favourites	retweets	favorites
count	3.916800e+04	3.916800e+04	39168.000000	3.916800e+04	39168.000000	39168.000000
mean	1.468352e+18	2.041943e+05	1678.257251	1.731431e+04	1.814874	6.091018
std	1.988668e+15	1.278784e+06	8965.896558	5.144423e+04	14.194947	47.194959
min	1.465648e+18	0.000000e+00	0.000000	0.000000e+00	0.000000	0.000000
25%	1.466591e+18	1.190000e+02	96.000000	3.110000e+02	0.000000	0.000000
50%	1.468111e+18	8.170000e+02	423.000000	2.075500e+03	0.000000	1.000000
75%	1.470467e+18	5.436250e+03	1437.000000	1.255650e+04	1.000000	2.000000
max	1.471584e+18	1.645623e+07	384929.000000	1.376481e+06	877.000000	2376.000000

	index	counts
0	11-30	3168
1	12-01	3000
2	12-02	3000
3	12-03	3185
4	12-04	2815
5	12-05	3000
6	12-07	4346
7	12-08	1654
8	12-10	1141
9	12-11	1859
10	12-13	3000
11	12-14	3000
12	12-15	3000
13	12-16	3000

	index	counts
0	00	1090
1	01	1216
2	02	1385
3	03	1960
4	04	2018
5	05	2033
6	06	2612
7	07	2516
8	08	1994
9	09	2008
10	10	2216
11	11	3240
12	12	2924
13	13	1805
14	14	1922
15	15	985
16	16	1002
17	17	1091
18	18	1098
19	19	1108
20	20	1284
21	21	466
22	22	559
23	23	636

Twitter Analysis about new Omicron Variant¶

Data Import¶

Question 1: How many tweets are there in the dataset? Is there any missing data in the user description, user location, or hashtags columns?¶

Question 2: How many tweets are posted per day? Which days saw the highest and lowest numbers of tweets about Omicron?¶

Question 3: How many tweets are posted in each hour of the day, across all days? Which hours saw the highest and lowest numbers of tweets about Omicron?¶

Question 4: What is the distribution of text length among Twitter posts about Omicron?¶

Text Cleaning¶

Question 5: What is the correlation between a user's number of Twitter followers and the number of retweets their Omicron tweets receive?¶

Question 6: Draw a heatmap. What does the correlation between variables tell you about Omicron?¶