In [1]:
# Import packages for this project
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
# Change some parameters
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)
pd.options.mode.chained_assignment = None
In [2]:
# Read-in dataset
df = pd.read_csv('movies.csv')
In [4]:
df.head()
Out[4]:
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 |
| 1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 |
| 2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 |
| 3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 |
| 4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 |
In [5]:
# Check for missing data with a loop
for col in df.columns:
pct_missing = np.mean(df[col].isnull())
print('{} - {}%'.format(col, round(pct_missing*100)))
name - 0% rating - 1% genre - 0% year - 0% released - 0% score - 0% votes - 0% director - 0% writer - 0% star - 0% country - 0% budget - 28% gross - 2% company - 0% runtime - 0%
In [6]:
# Check column datatypes
df.dtypes
Out[6]:
name object rating object genre object year int64 released object score float64 votes float64 director object writer object star object country object budget float64 gross float64 company object runtime float64 dtype: object
In [7]:
# Drop duplicates
df.drop_duplicates().head()
Out[7]:
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | The Shining | R | Drama | 1980 | June 13, 1980 (United States) | 8.4 | 927000.0 | Stanley Kubrick | Stephen King | Jack Nicholson | United Kingdom | 19000000.0 | 46998772.0 | Warner Bros. | 146.0 |
| 1 | The Blue Lagoon | R | Adventure | 1980 | July 2, 1980 (United States) | 5.8 | 65000.0 | Randal Kleiser | Henry De Vere Stacpoole | Brooke Shields | United States | 4500000.0 | 58853106.0 | Columbia Pictures | 104.0 |
| 2 | Star Wars: Episode V - The Empire Strikes Back | PG | Action | 1980 | June 20, 1980 (United States) | 8.7 | 1200000.0 | Irvin Kershner | Leigh Brackett | Mark Hamill | United States | 18000000.0 | 538375067.0 | Lucasfilm | 124.0 |
| 3 | Airplane! | PG | Comedy | 1980 | July 2, 1980 (United States) | 7.7 | 221000.0 | Jim Abrahams | Jim Abrahams | Robert Hays | United States | 3500000.0 | 83453539.0 | Paramount Pictures | 88.0 |
| 4 | Caddyshack | R | Comedy | 1980 | July 25, 1980 (United States) | 7.3 | 108000.0 | Harold Ramis | Brian Doyle-Murray | Chevy Chase | United States | 6000000.0 | 39846344.0 | Orion Pictures | 98.0 |
In [8]:
# Order data by gross
df.sort_values(by=['gross'], inplace=False, ascending=False).head()
Out[8]:
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5445 | Avatar | PG-13 | Action | 2009 | December 18, 2009 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Sam Worthington | United States | 237000000.0 | 2.847246e+09 | Twentieth Century Fox | 162.0 |
| 7445 | Avengers: Endgame | PG-13 | Action | 2019 | April 26, 2019 (United States) | 8.4 | 903000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 356000000.0 | 2.797501e+09 | Marvel Studios | 181.0 |
| 3045 | Titanic | PG-13 | Drama | 1997 | December 19, 1997 (United States) | 7.8 | 1100000.0 | James Cameron | James Cameron | Leonardo DiCaprio | United States | 200000000.0 | 2.201647e+09 | Twentieth Century Fox | 194.0 |
| 6663 | Star Wars: Episode VII - The Force Awakens | PG-13 | Action | 2015 | December 18, 2015 (United States) | 7.8 | 876000.0 | J.J. Abrams | Lawrence Kasdan | Daisy Ridley | United States | 245000000.0 | 2.069522e+09 | Lucasfilm | 138.0 |
| 7244 | Avengers: Infinity War | PG-13 | Action | 2018 | April 27, 2018 (United States) | 8.4 | 897000.0 | Anthony Russo | Christopher Markus | Robert Downey Jr. | United States | 321000000.0 | 2.048360e+09 | Marvel Studios | 149.0 |
In [15]:
# Linear regression between gross and budget
sns.regplot(x="budget", y="gross", data=df, scatter_kws={'color':'red'}, line_kws={'color':'blue'})
Out[15]:
<AxesSubplot:xlabel='budget', ylabel='gross'>
In [10]:
# Linear regression between score and gross
sns.regplot(x="score", y="gross", data=df)
Out[10]:
<AxesSubplot:xlabel='score', ylabel='gross'>
In [11]:
# Correlation Matrix between all numeric columns.
# 1) A value of -1 indicates a strong negative correlation
# 2) A value of 0 indicates no correlation
# 3) A value of 1 indicates a strong positive correlation
df.corr(method='pearson')
Out[11]:
| year | score | votes | budget | gross | runtime | |
|---|---|---|---|---|---|---|
| year | 1.000000 | 0.097995 | 0.222945 | 0.329321 | 0.257486 | 0.120811 |
| score | 0.097995 | 1.000000 | 0.409182 | 0.076254 | 0.186258 | 0.399451 |
| votes | 0.222945 | 0.409182 | 1.000000 | 0.442429 | 0.630757 | 0.309212 |
| budget | 0.329321 | 0.076254 | 0.442429 | 1.000000 | 0.740395 | 0.320447 |
| gross | 0.257486 | 0.186258 | 0.630757 | 0.740395 | 1.000000 | 0.245216 |
| runtime | 0.120811 | 0.399451 | 0.309212 | 0.320447 | 0.245216 | 1.000000 |
In [12]:
df.corr(method ='kendall')
Out[12]:
| year | score | votes | budget | gross | runtime | |
|---|---|---|---|---|---|---|
| year | 1.000000 | 0.067652 | 0.331465 | 0.224120 | 0.200618 | 0.097184 |
| score | 0.067652 | 1.000000 | 0.300115 | -0.000566 | 0.086046 | 0.283611 |
| votes | 0.331465 | 0.300115 | 1.000000 | 0.353702 | 0.548899 | 0.198240 |
| budget | 0.224120 | -0.000566 | 0.353702 | 1.000000 | 0.512637 | 0.235483 |
| gross | 0.200618 | 0.086046 | 0.548899 | 0.512637 | 1.000000 | 0.168933 |
| runtime | 0.097184 | 0.283611 | 0.198240 | 0.235483 | 0.168933 | 1.000000 |
In [13]:
df.corr(method ='spearman')
Out[13]:
| year | score | votes | budget | gross | runtime | |
|---|---|---|---|---|---|---|
| year | 1.000000 | 0.099045 | 0.469829 | 0.317336 | 0.293084 | 0.142977 |
| score | 0.099045 | 1.000000 | 0.428138 | -0.001403 | 0.126116 | 0.399857 |
| votes | 0.469829 | 0.428138 | 1.000000 | 0.502466 | 0.742050 | 0.290159 |
| budget | 0.317336 | -0.001403 | 0.502466 | 1.000000 | 0.693670 | 0.336370 |
| gross | 0.293084 | 0.126116 | 0.742050 | 0.693670 | 1.000000 | 0.246243 |
| runtime | 0.142977 | 0.399857 | 0.290159 | 0.336370 | 0.246243 | 1.000000 |
In [14]:
# Correlation Matrix Heat Map(High correlation between budget and gross(0.74))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot = True)
plt.title("Correlation Matrix For Numeric Features")
plt.xlabel("Movie Features")
plt.ylabel("Movie Features")
plt.show()
In [33]:
# Updated correlation matrix to include object datatypes(Convert categorical data to numerical data)
df_numerized = df
for col_name in df_numerized.columns:
if(df_numerized[col_name].dtype=='object'):
df_numerized[col_name] = df_numerized[col_name].astype('category')
df_numerized[col_name] = df_numerized[col_name].cat.codes
df_numerized
Out[33]:
| name | rating | genre | year | released | score | votes | director | writer | star | country | budget | gross | company | runtime | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6587 | 6 | 6 | 1980 | 1705 | 8.4 | 927000.0 | 2589 | 4014 | 1047 | 54 | 19000000.0 | 46998772.0 | 2319 | 146.0 |
| 1 | 5573 | 6 | 1 | 1980 | 1492 | 5.8 | 65000.0 | 2269 | 1632 | 327 | 55 | 4500000.0 | 58853106.0 | 731 | 104.0 |
| 2 | 5142 | 4 | 0 | 1980 | 1771 | 8.7 | 1200000.0 | 1111 | 2567 | 1745 | 55 | 18000000.0 | 538375067.0 | 1540 | 124.0 |
| 3 | 286 | 4 | 4 | 1980 | 1492 | 7.7 | 221000.0 | 1301 | 2000 | 2246 | 55 | 3500000.0 | 83453539.0 | 1812 | 88.0 |
| 4 | 1027 | 6 | 4 | 1980 | 1543 | 7.3 | 108000.0 | 1054 | 521 | 410 | 55 | 6000000.0 | 39846344.0 | 1777 | 98.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7663 | 3705 | -1 | 6 | 2020 | 2964 | 3.1 | 18.0 | 1500 | 2289 | 2421 | 55 | 7000.0 | NaN | -1 | 90.0 |
| 7664 | 1678 | -1 | 4 | 2020 | 1107 | 4.7 | 36.0 | 774 | 2614 | 1886 | 55 | NaN | NaN | 539 | 90.0 |
| 7665 | 4717 | -1 | 6 | 2020 | 193 | 5.7 | 29.0 | 2061 | 2683 | 2040 | 55 | 58750.0 | NaN | 941 | NaN |
| 7666 | 2843 | -1 | 6 | 2020 | 2817 | NaN | NaN | 1184 | 1824 | 450 | 55 | 15000.0 | NaN | -1 | 120.0 |
| 7667 | 5394 | -1 | 10 | 2020 | 391 | 5.7 | 7.0 | 2165 | 3344 | 2463 | 44 | NaN | NaN | 1787 | 102.0 |
7668 rows × 15 columns
In [34]:
# We can see clearly that there is also a high correlation between gross and votes(0.63))
correlation_matrix = df_numerized.corr()
sns.heatmap(correlation_matrix, annot = True)
plt.title("Correlation Matrix For All Features")
plt.xlabel("Movie Features")
plt.ylabel("Movie Features")
plt.show()
In [39]:
correlation_matrix = df_numerized.corr()
correlation_pairs = correlation_matrix.unstack()
In [40]:
correlation_pairs
Out[40]:
name name 1.000000
rating -0.008069
genre 0.016355
year 0.011453
released -0.011311
...
runtime country -0.078412
budget 0.320447
gross 0.245216
company 0.034402
runtime 1.000000
Length: 225, dtype: float64
In [46]:
sorted_pairs = correlation_pairs.sort_values()
In [47]:
sorted_pairs
Out[47]:
budget genre -0.356564
genre budget -0.356564
gross -0.235650
gross genre -0.235650
rating budget -0.176002
...
year year 1.000000
genre genre 1.000000
rating rating 1.000000
company company 1.000000
runtime runtime 1.000000
Length: 225, dtype: float64
In [48]:
high_correlation = sorted_pairs[(sorted_pairs) > 0.5]
high_correlation
Out[48]:
gross votes 0.630757 votes gross 0.630757 budget gross 0.740395 gross budget 0.740395 name name 1.000000 director director 1.000000 gross gross 1.000000 budget budget 1.000000 country country 1.000000 star star 1.000000 writer writer 1.000000 votes votes 1.000000 score score 1.000000 released released 1.000000 year year 1.000000 genre genre 1.000000 rating rating 1.000000 company company 1.000000 runtime runtime 1.000000 dtype: float64
In [ ]:
# Final verdict: Budget and votes have the highest correlation to gross revenue.
In [ ]:
In [18]:
# Top 15 companies by gross revenue
CompanyGrossSum = df.groupby('company')[["gross"]].sum()
CompanyGrossSumSorted = CompanyGrossSum.sort_values('gross', ascending = False)[:15]
CompanyGrossSumSorted = CompanyGrossSumSorted['gross'].astype('int64')
CompanyGrossSumSorted
Out[18]:
company Warner Bros. 56491421806 Universal Pictures 52514188890 Columbia Pictures 43008941346 Paramount Pictures 40493607415 Twentieth Century Fox 40257053857 Walt Disney Pictures 36327887792 New Line Cinema 19883797684 Marvel Studios 15065592411 DreamWorks Animation 11873612858 Touchstone Pictures 11795832638 Dreamworks Pictures 11635441081 Metro-Goldwyn-Mayer (MGM) 9230230105 Summit Entertainment 8373718838 Pixar Animation Studios 7886344526 Fox 2000 Pictures 7443502667 Name: gross, dtype: int64
In [19]:
# Total gross revenue generated by each company in each year(Top 15 sorted by gross revenue)
CompanyGrossSum = df.groupby(['company', 'year'])[["gross"]].sum()
CompanyGrossSumSorted = CompanyGrossSum.sort_values(['gross','company','year'], ascending = False)[:15]
CompanyGrossSumSorted = CompanyGrossSumSorted['gross'].astype('int64')
CompanyGrossSumSorted
Out[19]:
company year
Walt Disney Pictures 2019 5773131804
Marvel Studios 2018 4018631866
Universal Pictures 2015 3834354888
Twentieth Century Fox 2009 3793491246
Walt Disney Pictures 2017 3789382071
Paramount Pictures 2011 3565705182
Warner Bros. 2010 3300479986
2011 3223799224
Walt Disney Pictures 2010 3104474158
Paramount Pictures 2014 3071298586
Columbia Pictures 2006 2934631933
2019 2932757449
Marvel Studios 2019 2797501328
Warner Bros. 2018 2774168962
Columbia Pictures 2011 2738363306
Name: gross, dtype: int64
In [26]:
# Gross revenue and rating(PG-13 and PG more profitable)
sns.stripplot(x="rating", y="gross", data=df).set_title('Distribution Of Rating Over Gross')
Out[26]:
Text(0.5, 1.0, 'Distribution Of Rating Over Gross')
In [30]:
sns.violinplot(x="rating", y="gross", data=df, cut = 0).set_title('Distribution Of Rating Over Gross')
Out[30]:
Text(0.5, 1.0, 'Distribution Of Rating Over Gross')
In [ ]: