In [1]:
from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from collections import Counter
import seaborn as sns
import time
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cbots\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cbots\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [2]:
api_key = 'AIzaSyB52Lo63T7JgqjIwetBbRMxGOODhOJ2NDo'
In [3]:
channel_id = 'UCweCc7bSMX5J4jEH7HFImng'
In [4]:
api_service_name = "youtube"
api_version = "v3"
    
# Get credentials and create an API client
youtube = build(api_service_name, api_version, developerKey=api_key)
In [5]:
# This function returns a DataFrame with basic channel information:channelName,subscribers,views,totalVideos,playlistId

def get_channel_stats(youtube, channel_id):
    all_data = []
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id= channel_id
        )
    response = request.execute()
    # loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
               'subscribers': item['statistics']['subscriberCount'],
               'views': item['statistics']['viewCount'],
               'totalVideos': item['statistics']['videoCount'],
               'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
               }
        all_data.append(data)
    return(pd.DataFrame(all_data))
In [6]:
channel_stats = get_channel_stats(youtube, channel_id)
In [7]:
channel_stats
Out[7]:
channelName subscribers views totalVideos playlistId
0 GMHikaru 1760000 474828454 2296 UUweCc7bSMX5J4jEH7HFImng
In [8]:
def total_videos(youtube, channel_id):
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id = channel_id
    )
    response = request.execute()
    for item in response['items']:
        total_videos = item['statistics']['videoCount']
    return total_videos
In [9]:
videos = total_videos(youtube, channel_id)
videos
Out[9]:
'2296'
In [10]:
# This code returns the playlist_id of the channel

def get_playlist_id(youtube, channel_id):
    playlist_id = []
    request = youtube.channels().list(
        part="snippet,contentDetails",
        id = channel_id
    )
    response = request.execute()
    for item in response['items']:
        if 'contentDetails' in item and 'relatedPlaylists' in item['contentDetails']:
            playlists = item['contentDetails']['relatedPlaylists']
            playlist_id.extend(list(playlists.values()))
    return playlist_id
In [11]:
playlist_id = get_playlist_id(youtube, channel_id)
In [12]:
playlist_id
Out[12]:
['', 'UUweCc7bSMX5J4jEH7HFImng']
In [13]:
# This code retrieves 50 video_ids per request(maxResults=50).We put a restriction
# so that we don't exceed the YouTube API default quota allocation of 10,000 units per day.

# def get_video_ids(youtube, channel_id):
#     video_ids = []
#     next_page_token = None
#     while True:
#         request = youtube.search().list(
#             part="id",
#             channelId=channel_id,
#             type='video',
#             maxResults=50,
#             pageToken=next_page_token
#         )
#         response = request.execute()
#         page_video_count = len(response['items'])
#         print(f"Number of video IDs in this page: {page_video_count}")
#         for item in response['items']:
#             video_ids.append(item['id']['videoId'])
#         next_page_token = response.get('nextPageToken', None)
#         print(f"nextPageToken: {next_page_token}")
#         if not next_page_token:
#             break
#         time.sleep(30)
#     return video_ids


def get_video_ids(youtube, channel_id):
    video_ids = []
    processed_video_ids = set()

    # Try to load the list of processed video IDs from a file
    try:
        with open('processed_video_ids.txt', 'r') as file:
            for line in file:
                processed_video_ids.add(line.strip())
    except FileNotFoundError:
        # If the file doesn't exist, just create an empty set
        pass

    next_page_token = None
    while True:
        request = youtube.search().list(
            part="id",
            channelId=channel_id,
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            video_id = item['id']['videoId']
            if video_id not in processed_video_ids:
                video_ids.append(video_id)
                processed_video_ids.add(video_id)

        next_page_token = response.get('nextPageToken', None)
        if not next_page_token:
            break
        time.sleep(5)

    # Save the list of processed video IDs to a file
    with open('processed_video_ids.txt', 'w') as file:
        for video_id in processed_video_ids:
            file.write(f"{video_id}\n")

    return video_ids
In [ ]:
total_video_ids = get_video_ids(youtube, channel_id)
In [15]:
len(total_video_ids)
Out[15]:
496
In [25]:
# This code returns a DataFrame with important channel information.

def get_video_details(youtube, total_video_ids):
    all_video_info = []
    for i in range(0, len(total_video_ids), 50):
        request = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        id=','.join(total_video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    
    return pd.DataFrame(all_video_info)
In [26]:
video_df = get_video_details(youtube, total_video_ids)
video_df
Out[26]:
video_id channelTitle title description tags publishedAt viewCount likeCount favouriteCount commentCount duration definition caption
0 xfWkP1LsYpY GMHikaru Titled Tuesday Part 2, The Race to the Finish Titled Tuesday Part 2, The Race to the Finish\... [Hikaru Nakamura, master chess, chess instruct... 2022-01-27T21:15:02Z 61090 1245 None 54 PT50M49S hd true
1 dyuxOOAB5TE GMHikaru How Not to Mate! ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... [Hikaru Nakamura, master chess, chess instruct... 2023-02-15T22:15:02Z 59036 3048 None 45 PT28S hd false
2 PNr9Olps03E GMHikaru Even Hikaru Can't Get En Peasant Hikaru reviews a blunder he nearly made\n\n♟️ ... [Hikaru Nakamura, master chess, chess instruct... 2023-02-14T10:15:01Z 87625 5179 None 83 PT26S hd false
3 S-f1YdZ35pU GMHikaru Chess Is About Breaking the Rules | Botez Gamb... Hikaru speedruns without a queen and reminds u... [Hikaru Nakamura, master chess, chess instruct... 2022-01-31T22:45:01Z 104431 2457 None 145 PT20M14S hd true
4 LKQ2F_KEKtg GMHikaru Andrea Botez Refuses Hikaru ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... [Hikaru Nakamura, master chess, chess instruct... 2023-02-13T22:15:00Z 65773 2866 None 32 PT27S hd false
... ... ... ... ... ... ... ... ... ... ... ... ... ...
491 kGfk2f__t8M GMHikaru GM Hikaru Nakamura vs RUSSIA | Speed Chess Sup... GM Hikaru Nakamura vs RUSSIA | Speed Chess Sup... [Hikaru Nakamura, master chess, chess instruct... 2020-10-09T06:45:01Z 1189219 7116 None 409 PT3H14M46S hd false
492 cJI6ytWi9L8 GMHikaru Hikaru's on Fire! Check Out This Mate!! Check out this wild mate in Titled Tuesday Ear... [Hikaru Nakamura, master chess, chess instruct... 2022-07-15T07:15:01Z 318437 3615 None 167 PT2H3M23S hd true
493 WMnPXSZ-_AY GMHikaru Self Mating Ain't Easy Hikaru solves a deviously difficult self-mate ... [Hikaru Nakamura, master chess, chess instruct... 2021-06-22T20:15:01Z 83530 2729 None 173 PT20M58S hd true
494 CI3ZEVIIGy0 GMHikaru Polymarket Magnus vs Wesley So Final with Hika... In the FTX Crypto Finale Magnus Carlsen played... [hikaru nakamura, hikaru chess commentary, che... 2021-06-05T08:15:00Z 137265 2005 None 128 PT2H57M25S hd true
495 TqJikTiDZE8 GMHikaru Sacrifices Were Made vs Anish Giri Sacrifices Were Made by Hikaru in Rapid Chess ... [Hikaru Nakamura, master chess, chess instruct... 2022-05-13T19:30:01Z 86697 1936 None 100 PT51M1S hd true

496 rows × 13 columns

In [27]:
# Check for NULL values

video_df.isnull().any()
Out[27]:
video_id          False
channelTitle      False
title             False
description       False
tags              False
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool
In [28]:
# Check data types

video_df.dtypes
Out[28]:
video_id          object
channelTitle      object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favouriteCount    object
commentCount      object
duration          object
definition        object
caption           object
dtype: object
In [29]:
# Convert count columns to numeric

numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)
In [30]:
# Extract weekday name from publish date

from dateutil import parser
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))
In [31]:
# Convert duration to seconds

import isodate
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')
In [32]:
# Add tagCount column

video_df['tagCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))
In [33]:
video_df.sort_values('viewCount',ascending=False)
Out[33]:
video_id channelTitle title description tags publishedAt viewCount likeCount favouriteCount commentCount duration definition caption pushblishDayName durationSecs tagCount
311 dJmlDPamgso GMHikaru I'm Already Live! | 4 Player Chess with @Botez... 4 Player Chess with Hikaru, Alexandra Botez, G... [chess, Hikaru Nakamura, grandmaster, super-gr... 2020-06-13 15:00:12+00:00 5878000.0 114563.0 NaN 2692.0 PT50M58S hd true Saturday 3058.0 30
243 C-DXAaSiGxA GMHikaru How To Troll A Super Grandmaster Hikaru plays more games against Coach Dominic.... [Hikaru Nakamura, chess game play, monroe Stre... 2021-09-19 08:15:00+00:00 5707321.0 112914.0 NaN 3140.0 PT10M31S hd true Sunday 631.0 26
220 mZ5yqDtkQXI GMHikaru HikaruBot AI Plays Hikaru's Openings Against Him Hikaru attempts to beat his shadowy and myster... [chess, Hikaru Nakamura, grandmaster, super-gr... 2020-07-09 14:45:00+00:00 5692581.0 135817.0 NaN 4522.0 PT15M50S hd true Thursday 950.0 35
254 EYB7NOSY-hI GMHikaru Mittens The Chess Bot Will Make You Quit Chess Hikaru plays against the 1 elo rated Mittens c... [Hikaru Nakamura, master chess, chess instruct... 2023-01-04 12:00:13+00:00 4418693.0 121437.0 NaN 3885.0 PT21M44S hd true Wednesday 1304.0 16
317 9iB4mp5_ANI GMHikaru 15 Year Old Causes Carlsen to Lose His Cool Hikaru watches the Chessbase India video15-yea... [Hikaru Nakamura, master chess, chess instruct... 2020-12-15 07:45:00+00:00 4402255.0 83450.0 NaN 2809.0 PT15M55S hd true Tuesday 955.0 17
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
11 7FhuQcRhzyY GMHikaru It Doesn't Fit in the Hole! ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... [Hikaru Nakamura, master chess, chess instruct... 2023-02-14 22:15:00+00:00 33051.0 1788.0 NaN 41.0 PT38S hd false Tuesday 38.0 12
35 mN-guPtzZh0 GMHikaru What is the Wooden Shield? xqcL @fionchetta and @JonLudvigHammerChess explain ... [Hikaru Nakamura, master chess, chess instruct... 2023-02-16 11:18:13+00:00 31949.0 1106.0 NaN 21.0 PT44S hd false Thursday 44.0 15
84 BDOKWgL25SU GMHikaru Firouzja in Fashion? Hikaru reads a @chess article on Firouzja's Ne... [Hikaru Nakamura, master chess, chess instruct... 2023-02-15 23:15:00+00:00 30308.0 877.0 NaN 108.0 PT10M37S hd false Wednesday 637.0 19
244 jZgGV3BMKVA GMHikaru Here or here? ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... [Hikaru Nakamura, master chess, chess instruct... 2023-02-19 22:15:02+00:00 25566.0 1716.0 NaN 37.0 PT16S hd false Sunday 16.0 12
266 99dCcy9FBOE GMHikaru Beating 400s With 6 Knights ♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch... [Hikaru Nakamura, master chess, chess instruct... 2023-02-20 10:15:00+00:00 21894.0 1538.0 NaN 21.0 PT28S hd false Monday 28.0 12

496 rows × 16 columns

In [34]:
# Best Performing Videos

ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=False)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000000) + 'M'))
No description has been provided for this image
In [35]:
# Worst Performing Videos

ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=True)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000) + 'K'))
No description has been provided for this image
In [36]:
# View Distribution Per Video

sns.violinplot(video_df['channelTitle'], video_df['viewCount'])
C:\Users\cbots\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[36]:
<AxesSubplot:xlabel='channelTitle', ylabel='viewCount'>
No description has been provided for this image
In [37]:
# Views vs. Comments And Likes

fig, ax = plt.subplots(1,2)
sns.scatterplot(data = video_df, x = 'commentCount', y = 'viewCount', ax = ax[0])
sns.scatterplot(data = video_df, x = 'likeCount', y = 'viewCount', ax = ax[1])
Out[37]:
<AxesSubplot:xlabel='likeCount', ylabel='viewCount'>
No description has been provided for this image
In [38]:
# Video Duration

sns.histplot(data = video_df, x = 'durationSecs', bins=30)
Out[38]:
<AxesSubplot:xlabel='durationSecs', ylabel='Count'>
No description has been provided for this image
In [39]:
# Wordcloud For Video Titles

stop_words = set(stopwords.words('english'))
video_df['title_no_stopwords'] = video_df['title'].apply(lambda x: [item for item in str(x).split() if item not in stop_words])

all_words = list([a for b in video_df['title_no_stopwords'].tolist() for a in b])
all_words_str = ' '.join(all_words)
knight_shape = np.array(Image.open("knight_shape.png"))
def plot_cloud(wordcloud):
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud) 
    plt.axis("off");

wordcloud = WordCloud(random_state=1, background_color='black', 
                      colormap='viridis', collocations=False,mask=knight_shape,
                      contour_width=0.5,contour_color='yellow').generate(all_words_str)
plot_cloud(wordcloud)
# knight_shape = np.array(Image.open("knight_shape4.png"))
# wc = WordCloud(width=2000, height=1000, random_state=1,colormap='viridis',background_color='white',
#                   stopwords=stop_words,collocations=False,mask=knight_shape,contour_width=1,contour_color='black')
# wc.generate(all_words_str)
# image_colors = ImageColorGenerator(knight_shape)
# wc.recolor(color_func = image_colors)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()
No description has been provided for this image
In [40]:
# Upload Schedule

day_df = pd.DataFrame(video_df['pushblishDayName'].value_counts())
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_df = day_df.reindex(weekdays)
ax = day_df.reset_index().plot.bar(x='index', y='pushblishDayName', rot=0)
No description has been provided for this image
In [41]:
# List of the 10 most frequent words and their frequency percentages


# Concatenate the title and description columns and change the column type to string.
text = video_df['title'].str.cat(video_df['description'], sep=' ').to_string()

# Tokenize the text and remove stop words
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]

# Get the word frequency count
word_frequency = Counter(tokens)
total_words = sum(word_frequency.values())

# Get the top 10 most frequent words
top_words = word_frequency.most_common(10)

# Plot the frequency of the top 10 words
words = [word[0] for word in top_words]
counts = [word[1] for word in top_words]
percentages = [count/total_words*100 for count in counts]

plt.bar(words, percentages)
plt.xlabel('Words')
plt.ylabel('Frequency Percentage')
plt.title('Top 10 Most Frequent Words')
plt.xticks(rotation=90)
for i, v in enumerate(percentages):
    plt.text(i, v, str(round(v)) + "%", va='bottom', ha='center')
plt.show()
No description has been provided for this image
In [42]:
# Scatter plot of views against likes

plt.scatter(video_df["viewCount"], video_df["likeCount"])
plt.xlabel("Views")
plt.ylabel("Likes")
plt.title("Views vs Likes")
plt.show()
No description has been provided for this image
In [43]:
# Scatter plot of views against comments

plt.scatter(video_df["viewCount"], video_df["commentCount"])
plt.xlabel("Views")
plt.ylabel("Comments")
plt.title("Views vs Comments")
plt.show()
No description has been provided for this image
In [44]:
# Scatter plot of views against duration

plt.scatter(video_df["durationSecs"], video_df["viewCount"])
plt.xlabel("Duration (Seconds)")
plt.ylabel("Views")
plt.title("Scatter Plot Of Views Against Duration")
plt.show()
No description has been provided for this image
In [45]:
# Scatter plot of likes against comments

plt.scatter(video_df["likeCount"], video_df["commentCount"])
plt.xlabel("Likes")
plt.ylabel("Comments")
plt.title("Scatter Plot Of Likes Against Comments")
plt.show()
No description has been provided for this image
In [46]:
# Create a time series plot of the published dates of videos

# Convert the "publishedAt" column to datetime format
video_df["publishedAt"] = pd.to_datetime(video_df["publishedAt"])

# Set the "publishedAt" column as the index of the dataframe
video_df.set_index("publishedAt", inplace=True)

# Resample the data by month and plot the mean number of views
video_df["viewCount"].resample("M").mean().plot()
plt.xlabel("Published Date")
plt.ylabel("Mean Views")
plt.title("Time Series Of Mean Views By Month")
plt.show()
No description has been provided for this image
In [47]:
# 3D scatter plot to determine if there is a relationship between views, likes, and duration.

from mpl_toolkits.mplot3d import Axes3D
# create the figure and 3D axis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# create the scatter plot
ax.scatter(video_df["viewCount"], video_df["likeCount"], video_df["durationSecs"])

# set axis labels
ax.set_xlabel("Views")
ax.set_ylabel("Likes")
ax.set_zlabel("Duration")
plt.title("3D Scatter Plot Of Views, Likes And Duration")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]: