In [1]:
# Import packages for this project

import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

# Change some parameters

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

pd.options.mode.chained_assignment = None
In [2]:
# Read-in dataset

df = pd.read_csv('movies.csv')
In [4]:
df.head()
Out[4]:
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
In [5]:
# Check for missing data with a loop

for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))
name - 0%
rating - 1%
genre - 0%
year - 0%
released - 0%
score - 0%
votes - 0%
director - 0%
writer - 0%
star - 0%
country - 0%
budget - 28%
gross - 2%
company - 0%
runtime - 0%
In [6]:
# Check column datatypes

df.dtypes
Out[6]:
name         object
rating       object
genre        object
year          int64
released     object
score       float64
votes       float64
director     object
writer       object
star         object
country      object
budget      float64
gross       float64
company      object
runtime     float64
dtype: object
In [7]:
# Drop duplicates

df.drop_duplicates().head()
Out[7]:
name rating genre year released score votes director writer star country budget gross company runtime
0 The Shining R Drama 1980 June 13, 1980 (United States) 8.4 927000.0 Stanley Kubrick Stephen King Jack Nicholson United Kingdom 19000000.0 46998772.0 Warner Bros. 146.0
1 The Blue Lagoon R Adventure 1980 July 2, 1980 (United States) 5.8 65000.0 Randal Kleiser Henry De Vere Stacpoole Brooke Shields United States 4500000.0 58853106.0 Columbia Pictures 104.0
2 Star Wars: Episode V - The Empire Strikes Back PG Action 1980 June 20, 1980 (United States) 8.7 1200000.0 Irvin Kershner Leigh Brackett Mark Hamill United States 18000000.0 538375067.0 Lucasfilm 124.0
3 Airplane! PG Comedy 1980 July 2, 1980 (United States) 7.7 221000.0 Jim Abrahams Jim Abrahams Robert Hays United States 3500000.0 83453539.0 Paramount Pictures 88.0
4 Caddyshack R Comedy 1980 July 25, 1980 (United States) 7.3 108000.0 Harold Ramis Brian Doyle-Murray Chevy Chase United States 6000000.0 39846344.0 Orion Pictures 98.0
In [8]:
# Order data by gross

df.sort_values(by=['gross'], inplace=False, ascending=False).head()
Out[8]:
name rating genre year released score votes director writer star country budget gross company runtime
5445 Avatar PG-13 Action 2009 December 18, 2009 (United States) 7.8 1100000.0 James Cameron James Cameron Sam Worthington United States 237000000.0 2.847246e+09 Twentieth Century Fox 162.0
7445 Avengers: Endgame PG-13 Action 2019 April 26, 2019 (United States) 8.4 903000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 356000000.0 2.797501e+09 Marvel Studios 181.0
3045 Titanic PG-13 Drama 1997 December 19, 1997 (United States) 7.8 1100000.0 James Cameron James Cameron Leonardo DiCaprio United States 200000000.0 2.201647e+09 Twentieth Century Fox 194.0
6663 Star Wars: Episode VII - The Force Awakens PG-13 Action 2015 December 18, 2015 (United States) 7.8 876000.0 J.J. Abrams Lawrence Kasdan Daisy Ridley United States 245000000.0 2.069522e+09 Lucasfilm 138.0
7244 Avengers: Infinity War PG-13 Action 2018 April 27, 2018 (United States) 8.4 897000.0 Anthony Russo Christopher Markus Robert Downey Jr. United States 321000000.0 2.048360e+09 Marvel Studios 149.0
In [15]:
# Linear regression between gross and budget

sns.regplot(x="budget", y="gross", data=df, scatter_kws={'color':'red'}, line_kws={'color':'blue'})
Out[15]:
<AxesSubplot:xlabel='budget', ylabel='gross'>
No description has been provided for this image
In [10]:
# Linear regression between score and gross

sns.regplot(x="score", y="gross", data=df)
Out[10]:
<AxesSubplot:xlabel='score', ylabel='gross'>
No description has been provided for this image
In [11]:
# Correlation Matrix between all numeric columns.
# 1) A value of -1 indicates a strong negative correlation
# 2) A value of 0 indicates no correlation
# 3) A value of 1 indicates a strong positive correlation

df.corr(method='pearson')
Out[11]:
year score votes budget gross runtime
year 1.000000 0.097995 0.222945 0.329321 0.257486 0.120811
score 0.097995 1.000000 0.409182 0.076254 0.186258 0.399451
votes 0.222945 0.409182 1.000000 0.442429 0.630757 0.309212
budget 0.329321 0.076254 0.442429 1.000000 0.740395 0.320447
gross 0.257486 0.186258 0.630757 0.740395 1.000000 0.245216
runtime 0.120811 0.399451 0.309212 0.320447 0.245216 1.000000
In [12]:
df.corr(method ='kendall')
Out[12]:
year score votes budget gross runtime
year 1.000000 0.067652 0.331465 0.224120 0.200618 0.097184
score 0.067652 1.000000 0.300115 -0.000566 0.086046 0.283611
votes 0.331465 0.300115 1.000000 0.353702 0.548899 0.198240
budget 0.224120 -0.000566 0.353702 1.000000 0.512637 0.235483
gross 0.200618 0.086046 0.548899 0.512637 1.000000 0.168933
runtime 0.097184 0.283611 0.198240 0.235483 0.168933 1.000000
In [13]:
df.corr(method ='spearman')
Out[13]:
year score votes budget gross runtime
year 1.000000 0.099045 0.469829 0.317336 0.293084 0.142977
score 0.099045 1.000000 0.428138 -0.001403 0.126116 0.399857
votes 0.469829 0.428138 1.000000 0.502466 0.742050 0.290159
budget 0.317336 -0.001403 0.502466 1.000000 0.693670 0.336370
gross 0.293084 0.126116 0.742050 0.693670 1.000000 0.246243
runtime 0.142977 0.399857 0.290159 0.336370 0.246243 1.000000
In [14]:
# Correlation Matrix Heat Map(High correlation between budget and gross(0.74))

correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot = True)
plt.title("Correlation Matrix For Numeric Features")
plt.xlabel("Movie Features")
plt.ylabel("Movie Features")
plt.show()
No description has been provided for this image
In [33]:
# Updated correlation matrix to include object datatypes(Convert categorical data to numerical data)

df_numerized = df
for col_name in df_numerized.columns:
    if(df_numerized[col_name].dtype=='object'):
        df_numerized[col_name] = df_numerized[col_name].astype('category')
        df_numerized[col_name] = df_numerized[col_name].cat.codes
        
df_numerized             
Out[33]:
name rating genre year released score votes director writer star country budget gross company runtime
0 6587 6 6 1980 1705 8.4 927000.0 2589 4014 1047 54 19000000.0 46998772.0 2319 146.0
1 5573 6 1 1980 1492 5.8 65000.0 2269 1632 327 55 4500000.0 58853106.0 731 104.0
2 5142 4 0 1980 1771 8.7 1200000.0 1111 2567 1745 55 18000000.0 538375067.0 1540 124.0
3 286 4 4 1980 1492 7.7 221000.0 1301 2000 2246 55 3500000.0 83453539.0 1812 88.0
4 1027 6 4 1980 1543 7.3 108000.0 1054 521 410 55 6000000.0 39846344.0 1777 98.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7663 3705 -1 6 2020 2964 3.1 18.0 1500 2289 2421 55 7000.0 NaN -1 90.0
7664 1678 -1 4 2020 1107 4.7 36.0 774 2614 1886 55 NaN NaN 539 90.0
7665 4717 -1 6 2020 193 5.7 29.0 2061 2683 2040 55 58750.0 NaN 941 NaN
7666 2843 -1 6 2020 2817 NaN NaN 1184 1824 450 55 15000.0 NaN -1 120.0
7667 5394 -1 10 2020 391 5.7 7.0 2165 3344 2463 44 NaN NaN 1787 102.0

7668 rows × 15 columns

In [34]:
# We can see clearly that there is also a high correlation between gross and votes(0.63))
correlation_matrix = df_numerized.corr()
sns.heatmap(correlation_matrix, annot = True)
plt.title("Correlation Matrix For All Features")
plt.xlabel("Movie Features")
plt.ylabel("Movie Features")
plt.show()
No description has been provided for this image
In [39]:
correlation_matrix = df_numerized.corr()
correlation_pairs = correlation_matrix.unstack()
In [40]:
correlation_pairs
Out[40]:
name     name        1.000000
         rating     -0.008069
         genre       0.016355
         year        0.011453
         released   -0.011311
                       ...   
runtime  country    -0.078412
         budget      0.320447
         gross       0.245216
         company     0.034402
         runtime     1.000000
Length: 225, dtype: float64
In [46]:
sorted_pairs = correlation_pairs.sort_values()
In [47]:
sorted_pairs
Out[47]:
budget   genre     -0.356564
genre    budget    -0.356564
         gross     -0.235650
gross    genre     -0.235650
rating   budget    -0.176002
                      ...   
year     year       1.000000
genre    genre      1.000000
rating   rating     1.000000
company  company    1.000000
runtime  runtime    1.000000
Length: 225, dtype: float64
In [48]:
high_correlation = sorted_pairs[(sorted_pairs) > 0.5]
high_correlation
Out[48]:
gross     votes       0.630757
votes     gross       0.630757
budget    gross       0.740395
gross     budget      0.740395
name      name        1.000000
director  director    1.000000
gross     gross       1.000000
budget    budget      1.000000
country   country     1.000000
star      star        1.000000
writer    writer      1.000000
votes     votes       1.000000
score     score       1.000000
released  released    1.000000
year      year        1.000000
genre     genre       1.000000
rating    rating      1.000000
company   company     1.000000
runtime   runtime     1.000000
dtype: float64
In [ ]:
# Final verdict: Budget and votes have the highest correlation to gross revenue.
In [ ]:
 
In [18]:
# Top 15 companies by gross revenue

CompanyGrossSum = df.groupby('company')[["gross"]].sum()

CompanyGrossSumSorted = CompanyGrossSum.sort_values('gross', ascending = False)[:15]

CompanyGrossSumSorted = CompanyGrossSumSorted['gross'].astype('int64') 

CompanyGrossSumSorted
Out[18]:
company
Warner Bros.                 56491421806
Universal Pictures           52514188890
Columbia Pictures            43008941346
Paramount Pictures           40493607415
Twentieth Century Fox        40257053857
Walt Disney Pictures         36327887792
New Line Cinema              19883797684
Marvel Studios               15065592411
DreamWorks Animation         11873612858
Touchstone Pictures          11795832638
Dreamworks Pictures          11635441081
Metro-Goldwyn-Mayer (MGM)     9230230105
Summit Entertainment          8373718838
Pixar Animation Studios       7886344526
Fox 2000 Pictures             7443502667
Name: gross, dtype: int64
In [19]:
# Total gross revenue generated by each company in each year(Top 15 sorted by gross revenue)

CompanyGrossSum = df.groupby(['company', 'year'])[["gross"]].sum()

CompanyGrossSumSorted = CompanyGrossSum.sort_values(['gross','company','year'], ascending = False)[:15]

CompanyGrossSumSorted = CompanyGrossSumSorted['gross'].astype('int64') 

CompanyGrossSumSorted
Out[19]:
company                year
Walt Disney Pictures   2019    5773131804
Marvel Studios         2018    4018631866
Universal Pictures     2015    3834354888
Twentieth Century Fox  2009    3793491246
Walt Disney Pictures   2017    3789382071
Paramount Pictures     2011    3565705182
Warner Bros.           2010    3300479986
                       2011    3223799224
Walt Disney Pictures   2010    3104474158
Paramount Pictures     2014    3071298586
Columbia Pictures      2006    2934631933
                       2019    2932757449
Marvel Studios         2019    2797501328
Warner Bros.           2018    2774168962
Columbia Pictures      2011    2738363306
Name: gross, dtype: int64
In [26]:
# Gross revenue and rating(PG-13 and PG more profitable)

sns.stripplot(x="rating", y="gross", data=df).set_title('Distribution Of Rating Over Gross')
Out[26]:
Text(0.5, 1.0, 'Distribution Of Rating Over Gross')
No description has been provided for this image
In [30]:
sns.violinplot(x="rating", y="gross",  data=df, cut = 0).set_title('Distribution Of Rating Over Gross')
Out[30]:
Text(0.5, 1.0, 'Distribution Of Rating Over Gross')
No description has been provided for this image
In [ ]: