In [1]:
from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from collections import Counter
import seaborn as sns
import time
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\cbots\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\cbots\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date!
In [2]:
api_key = 'AIzaSyB52Lo63T7JgqjIwetBbRMxGOODhOJ2NDo'
In [3]:
channel_id = 'UCweCc7bSMX5J4jEH7HFImng'
In [4]:
api_service_name = "youtube"
api_version = "v3"
# Get credentials and create an API client
youtube = build(api_service_name, api_version, developerKey=api_key)
In [5]:
# This function returns a DataFrame with basic channel information:channelName,subscribers,views,totalVideos,playlistId
def get_channel_stats(youtube, channel_id):
all_data = []
request = youtube.channels().list(
part="snippet,contentDetails,statistics",
id= channel_id
)
response = request.execute()
# loop through items
for item in response['items']:
data = {'channelName': item['snippet']['title'],
'subscribers': item['statistics']['subscriberCount'],
'views': item['statistics']['viewCount'],
'totalVideos': item['statistics']['videoCount'],
'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
}
all_data.append(data)
return(pd.DataFrame(all_data))
In [6]:
channel_stats = get_channel_stats(youtube, channel_id)
In [7]:
channel_stats
Out[7]:
| channelName | subscribers | views | totalVideos | playlistId | |
|---|---|---|---|---|---|
| 0 | GMHikaru | 1760000 | 474828454 | 2296 | UUweCc7bSMX5J4jEH7HFImng |
In [8]:
def total_videos(youtube, channel_id):
request = youtube.channels().list(
part="snippet,contentDetails,statistics",
id = channel_id
)
response = request.execute()
for item in response['items']:
total_videos = item['statistics']['videoCount']
return total_videos
In [9]:
videos = total_videos(youtube, channel_id)
videos
Out[9]:
'2296'
In [10]:
# This code returns the playlist_id of the channel
def get_playlist_id(youtube, channel_id):
playlist_id = []
request = youtube.channels().list(
part="snippet,contentDetails",
id = channel_id
)
response = request.execute()
for item in response['items']:
if 'contentDetails' in item and 'relatedPlaylists' in item['contentDetails']:
playlists = item['contentDetails']['relatedPlaylists']
playlist_id.extend(list(playlists.values()))
return playlist_id
In [11]:
playlist_id = get_playlist_id(youtube, channel_id)
In [12]:
playlist_id
Out[12]:
['', 'UUweCc7bSMX5J4jEH7HFImng']
In [13]:
# This code retrieves 50 video_ids per request(maxResults=50).We put a restriction
# so that we don't exceed the YouTube API default quota allocation of 10,000 units per day.
# def get_video_ids(youtube, channel_id):
# video_ids = []
# next_page_token = None
# while True:
# request = youtube.search().list(
# part="id",
# channelId=channel_id,
# type='video',
# maxResults=50,
# pageToken=next_page_token
# )
# response = request.execute()
# page_video_count = len(response['items'])
# print(f"Number of video IDs in this page: {page_video_count}")
# for item in response['items']:
# video_ids.append(item['id']['videoId'])
# next_page_token = response.get('nextPageToken', None)
# print(f"nextPageToken: {next_page_token}")
# if not next_page_token:
# break
# time.sleep(30)
# return video_ids
def get_video_ids(youtube, channel_id):
video_ids = []
processed_video_ids = set()
# Try to load the list of processed video IDs from a file
try:
with open('processed_video_ids.txt', 'r') as file:
for line in file:
processed_video_ids.add(line.strip())
except FileNotFoundError:
# If the file doesn't exist, just create an empty set
pass
next_page_token = None
while True:
request = youtube.search().list(
part="id",
channelId=channel_id,
type='video',
maxResults=50,
pageToken=next_page_token
)
response = request.execute()
for item in response['items']:
video_id = item['id']['videoId']
if video_id not in processed_video_ids:
video_ids.append(video_id)
processed_video_ids.add(video_id)
next_page_token = response.get('nextPageToken', None)
if not next_page_token:
break
time.sleep(5)
# Save the list of processed video IDs to a file
with open('processed_video_ids.txt', 'w') as file:
for video_id in processed_video_ids:
file.write(f"{video_id}\n")
return video_ids
In [ ]:
total_video_ids = get_video_ids(youtube, channel_id)
In [15]:
len(total_video_ids)
Out[15]:
496
In [25]:
# This code returns a DataFrame with important channel information.
def get_video_details(youtube, total_video_ids):
all_video_info = []
for i in range(0, len(total_video_ids), 50):
request = youtube.videos().list(
part="snippet,contentDetails,statistics",
id=','.join(total_video_ids[i:i+50])
)
response = request.execute()
for video in response['items']:
stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
'contentDetails': ['duration', 'definition', 'caption']
}
video_info = {}
video_info['video_id'] = video['id']
for k in stats_to_keep.keys():
for v in stats_to_keep[k]:
try:
video_info[v] = video[k][v]
except:
video_info[v] = None
all_video_info.append(video_info)
return pd.DataFrame(all_video_info)
In [26]:
video_df = get_video_details(youtube, total_video_ids)
video_df
Out[26]:
| video_id | channelTitle | title | description | tags | publishedAt | viewCount | likeCount | favouriteCount | commentCount | duration | definition | caption | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | xfWkP1LsYpY | GMHikaru | Titled Tuesday Part 2, The Race to the Finish | Titled Tuesday Part 2, The Race to the Finish\... | [Hikaru Nakamura, master chess, chess instruct... | 2022-01-27T21:15:02Z | 61090 | 1245 | None | 54 | PT50M49S | hd | true |
| 1 | dyuxOOAB5TE | GMHikaru | How Not to Mate! | ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-15T22:15:02Z | 59036 | 3048 | None | 45 | PT28S | hd | false |
| 2 | PNr9Olps03E | GMHikaru | Even Hikaru Can't Get En Peasant | Hikaru reviews a blunder he nearly made\n\n♟️ ... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-14T10:15:01Z | 87625 | 5179 | None | 83 | PT26S | hd | false |
| 3 | S-f1YdZ35pU | GMHikaru | Chess Is About Breaking the Rules | Botez Gamb... | Hikaru speedruns without a queen and reminds u... | [Hikaru Nakamura, master chess, chess instruct... | 2022-01-31T22:45:01Z | 104431 | 2457 | None | 145 | PT20M14S | hd | true |
| 4 | LKQ2F_KEKtg | GMHikaru | Andrea Botez Refuses Hikaru | ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-13T22:15:00Z | 65773 | 2866 | None | 32 | PT27S | hd | false |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 491 | kGfk2f__t8M | GMHikaru | GM Hikaru Nakamura vs RUSSIA | Speed Chess Sup... | GM Hikaru Nakamura vs RUSSIA | Speed Chess Sup... | [Hikaru Nakamura, master chess, chess instruct... | 2020-10-09T06:45:01Z | 1189219 | 7116 | None | 409 | PT3H14M46S | hd | false |
| 492 | cJI6ytWi9L8 | GMHikaru | Hikaru's on Fire! Check Out This Mate!! | Check out this wild mate in Titled Tuesday Ear... | [Hikaru Nakamura, master chess, chess instruct... | 2022-07-15T07:15:01Z | 318437 | 3615 | None | 167 | PT2H3M23S | hd | true |
| 493 | WMnPXSZ-_AY | GMHikaru | Self Mating Ain't Easy | Hikaru solves a deviously difficult self-mate ... | [Hikaru Nakamura, master chess, chess instruct... | 2021-06-22T20:15:01Z | 83530 | 2729 | None | 173 | PT20M58S | hd | true |
| 494 | CI3ZEVIIGy0 | GMHikaru | Polymarket Magnus vs Wesley So Final with Hika... | In the FTX Crypto Finale Magnus Carlsen played... | [hikaru nakamura, hikaru chess commentary, che... | 2021-06-05T08:15:00Z | 137265 | 2005 | None | 128 | PT2H57M25S | hd | true |
| 495 | TqJikTiDZE8 | GMHikaru | Sacrifices Were Made vs Anish Giri | Sacrifices Were Made by Hikaru in Rapid Chess ... | [Hikaru Nakamura, master chess, chess instruct... | 2022-05-13T19:30:01Z | 86697 | 1936 | None | 100 | PT51M1S | hd | true |
496 rows × 13 columns
In [27]:
# Check for NULL values
video_df.isnull().any()
Out[27]:
video_id False channelTitle False title False description False tags False publishedAt False viewCount False likeCount False favouriteCount True commentCount False duration False definition False caption False dtype: bool
In [28]:
# Check data types
video_df.dtypes
Out[28]:
video_id object channelTitle object title object description object tags object publishedAt object viewCount object likeCount object favouriteCount object commentCount object duration object definition object caption object dtype: object
In [29]:
# Convert count columns to numeric
numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)
In [30]:
# Extract weekday name from publish date
from dateutil import parser
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x))
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))
In [31]:
# Convert duration to seconds
import isodate
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')
In [32]:
# Add tagCount column
video_df['tagCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))
In [33]:
video_df.sort_values('viewCount',ascending=False)
Out[33]:
| video_id | channelTitle | title | description | tags | publishedAt | viewCount | likeCount | favouriteCount | commentCount | duration | definition | caption | pushblishDayName | durationSecs | tagCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 311 | dJmlDPamgso | GMHikaru | I'm Already Live! | 4 Player Chess with @Botez... | 4 Player Chess with Hikaru, Alexandra Botez, G... | [chess, Hikaru Nakamura, grandmaster, super-gr... | 2020-06-13 15:00:12+00:00 | 5878000.0 | 114563.0 | NaN | 2692.0 | PT50M58S | hd | true | Saturday | 3058.0 | 30 |
| 243 | C-DXAaSiGxA | GMHikaru | How To Troll A Super Grandmaster | Hikaru plays more games against Coach Dominic.... | [Hikaru Nakamura, chess game play, monroe Stre... | 2021-09-19 08:15:00+00:00 | 5707321.0 | 112914.0 | NaN | 3140.0 | PT10M31S | hd | true | Sunday | 631.0 | 26 |
| 220 | mZ5yqDtkQXI | GMHikaru | HikaruBot AI Plays Hikaru's Openings Against Him | Hikaru attempts to beat his shadowy and myster... | [chess, Hikaru Nakamura, grandmaster, super-gr... | 2020-07-09 14:45:00+00:00 | 5692581.0 | 135817.0 | NaN | 4522.0 | PT15M50S | hd | true | Thursday | 950.0 | 35 |
| 254 | EYB7NOSY-hI | GMHikaru | Mittens The Chess Bot Will Make You Quit Chess | Hikaru plays against the 1 elo rated Mittens c... | [Hikaru Nakamura, master chess, chess instruct... | 2023-01-04 12:00:13+00:00 | 4418693.0 | 121437.0 | NaN | 3885.0 | PT21M44S | hd | true | Wednesday | 1304.0 | 16 |
| 317 | 9iB4mp5_ANI | GMHikaru | 15 Year Old Causes Carlsen to Lose His Cool | Hikaru watches the Chessbase India video15-yea... | [Hikaru Nakamura, master chess, chess instruct... | 2020-12-15 07:45:00+00:00 | 4402255.0 | 83450.0 | NaN | 2809.0 | PT15M55S | hd | true | Tuesday | 955.0 | 17 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11 | 7FhuQcRhzyY | GMHikaru | It Doesn't Fit in the Hole! | ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-14 22:15:00+00:00 | 33051.0 | 1788.0 | NaN | 41.0 | PT38S | hd | false | Tuesday | 38.0 | 12 |
| 35 | mN-guPtzZh0 | GMHikaru | What is the Wooden Shield? xqcL | @fionchetta and @JonLudvigHammerChess explain ... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-16 11:18:13+00:00 | 31949.0 | 1106.0 | NaN | 21.0 | PT44S | hd | false | Thursday | 44.0 | 15 |
| 84 | BDOKWgL25SU | GMHikaru | Firouzja in Fashion? | Hikaru reads a @chess article on Firouzja's Ne... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-15 23:15:00+00:00 | 30308.0 | 877.0 | NaN | 108.0 | PT10M37S | hd | false | Wednesday | 637.0 | 19 |
| 244 | jZgGV3BMKVA | GMHikaru | Here or here? | ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-19 22:15:02+00:00 | 25566.0 | 1716.0 | NaN | 37.0 | PT16S | hd | false | Sunday | 16.0 | 12 |
| 266 | 99dCcy9FBOE | GMHikaru | Beating 400s With 6 Knights | ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... | [Hikaru Nakamura, master chess, chess instruct... | 2023-02-20 10:15:00+00:00 | 21894.0 | 1538.0 | NaN | 21.0 | PT28S | hd | false | Monday | 28.0 | 12 |
496 rows × 16 columns
In [34]:
# Best Performing Videos
ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=False)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000000) + 'M'))
In [35]:
# Worst Performing Videos
ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=True)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000) + 'K'))
In [36]:
# View Distribution Per Video
sns.violinplot(video_df['channelTitle'], video_df['viewCount'])
C:\Users\cbots\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
Out[36]:
<AxesSubplot:xlabel='channelTitle', ylabel='viewCount'>
In [37]:
# Views vs. Comments And Likes
fig, ax = plt.subplots(1,2)
sns.scatterplot(data = video_df, x = 'commentCount', y = 'viewCount', ax = ax[0])
sns.scatterplot(data = video_df, x = 'likeCount', y = 'viewCount', ax = ax[1])
Out[37]:
<AxesSubplot:xlabel='likeCount', ylabel='viewCount'>
In [38]:
# Video Duration
sns.histplot(data = video_df, x = 'durationSecs', bins=30)
Out[38]:
<AxesSubplot:xlabel='durationSecs', ylabel='Count'>
In [39]:
# Wordcloud For Video Titles
stop_words = set(stopwords.words('english'))
video_df['title_no_stopwords'] = video_df['title'].apply(lambda x: [item for item in str(x).split() if item not in stop_words])
all_words = list([a for b in video_df['title_no_stopwords'].tolist() for a in b])
all_words_str = ' '.join(all_words)
knight_shape = np.array(Image.open("knight_shape.png"))
def plot_cloud(wordcloud):
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis("off");
wordcloud = WordCloud(random_state=1, background_color='black',
colormap='viridis', collocations=False,mask=knight_shape,
contour_width=0.5,contour_color='yellow').generate(all_words_str)
plot_cloud(wordcloud)
# knight_shape = np.array(Image.open("knight_shape4.png"))
# wc = WordCloud(width=2000, height=1000, random_state=1,colormap='viridis',background_color='white',
# stopwords=stop_words,collocations=False,mask=knight_shape,contour_width=1,contour_color='black')
# wc.generate(all_words_str)
# image_colors = ImageColorGenerator(knight_shape)
# wc.recolor(color_func = image_colors)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()
In [40]:
# Upload Schedule
day_df = pd.DataFrame(video_df['pushblishDayName'].value_counts())
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_df = day_df.reindex(weekdays)
ax = day_df.reset_index().plot.bar(x='index', y='pushblishDayName', rot=0)
In [41]:
# List of the 10 most frequent words and their frequency percentages
# Concatenate the title and description columns and change the column type to string.
text = video_df['title'].str.cat(video_df['description'], sep=' ').to_string()
# Tokenize the text and remove stop words
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
# Get the word frequency count
word_frequency = Counter(tokens)
total_words = sum(word_frequency.values())
# Get the top 10 most frequent words
top_words = word_frequency.most_common(10)
# Plot the frequency of the top 10 words
words = [word[0] for word in top_words]
counts = [word[1] for word in top_words]
percentages = [count/total_words*100 for count in counts]
plt.bar(words, percentages)
plt.xlabel('Words')
plt.ylabel('Frequency Percentage')
plt.title('Top 10 Most Frequent Words')
plt.xticks(rotation=90)
for i, v in enumerate(percentages):
plt.text(i, v, str(round(v)) + "%", va='bottom', ha='center')
plt.show()
In [42]:
# Scatter plot of views against likes
plt.scatter(video_df["viewCount"], video_df["likeCount"])
plt.xlabel("Views")
plt.ylabel("Likes")
plt.title("Views vs Likes")
plt.show()
In [43]:
# Scatter plot of views against comments
plt.scatter(video_df["viewCount"], video_df["commentCount"])
plt.xlabel("Views")
plt.ylabel("Comments")
plt.title("Views vs Comments")
plt.show()
In [44]:
# Scatter plot of views against duration
plt.scatter(video_df["durationSecs"], video_df["viewCount"])
plt.xlabel("Duration (Seconds)")
plt.ylabel("Views")
plt.title("Scatter Plot Of Views Against Duration")
plt.show()
In [45]:
# Scatter plot of likes against comments
plt.scatter(video_df["likeCount"], video_df["commentCount"])
plt.xlabel("Likes")
plt.ylabel("Comments")
plt.title("Scatter Plot Of Likes Against Comments")
plt.show()
In [46]:
# Create a time series plot of the published dates of videos
# Convert the "publishedAt" column to datetime format
video_df["publishedAt"] = pd.to_datetime(video_df["publishedAt"])
# Set the "publishedAt" column as the index of the dataframe
video_df.set_index("publishedAt", inplace=True)
# Resample the data by month and plot the mean number of views
video_df["viewCount"].resample("M").mean().plot()
plt.xlabel("Published Date")
plt.ylabel("Mean Views")
plt.title("Time Series Of Mean Views By Month")
plt.show()
In [47]:
# 3D scatter plot to determine if there is a relationship between views, likes, and duration.
from mpl_toolkits.mplot3d import Axes3D
# create the figure and 3D axis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# create the scatter plot
ax.scatter(video_df["viewCount"], video_df["likeCount"], video_df["durationSecs"])
# set axis labels
ax.set_xlabel("Views")
ax.set_ylabel("Likes")
ax.set_zlabel("Duration")
plt.title("3D Scatter Plot Of Views, Likes And Duration")
plt.show()
In [ ]:
In [ ]: