from googleapiclient.discovery import build
import pandas as pd
from IPython.display import JSON
import json
import requests
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
from collections import Counter
import seaborn as sns
import time

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cbots\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cbots\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!

api_key = 'AIzaSyB52Lo63T7JgqjIwetBbRMxGOODhOJ2NDo'

channel_id = 'UCweCc7bSMX5J4jEH7HFImng'

api_service_name = "youtube"
api_version = "v3"
    
# Get credentials and create an API client
youtube = build(api_service_name, api_version, developerKey=api_key)

# This function returns a DataFrame with basic channel information:channelName,subscribers,views,totalVideos,playlistId

def get_channel_stats(youtube, channel_id):
    all_data = []
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id= channel_id
        )
    response = request.execute()
    # loop through items
    for item in response['items']:
        data = {'channelName': item['snippet']['title'],
               'subscribers': item['statistics']['subscriberCount'],
               'views': item['statistics']['viewCount'],
               'totalVideos': item['statistics']['videoCount'],
               'playlistId': item['contentDetails']['relatedPlaylists']['uploads']
               }
        all_data.append(data)
    return(pd.DataFrame(all_data))

channel_stats = get_channel_stats(youtube, channel_id)

channel_stats

def total_videos(youtube, channel_id):
    request = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        id = channel_id
    )
    response = request.execute()
    for item in response['items']:
        total_videos = item['statistics']['videoCount']
    return total_videos

videos = total_videos(youtube, channel_id)
videos

'2296'

# This code returns the playlist_id of the channel

def get_playlist_id(youtube, channel_id):
    playlist_id = []
    request = youtube.channels().list(
        part="snippet,contentDetails",
        id = channel_id
    )
    response = request.execute()
    for item in response['items']:
        if 'contentDetails' in item and 'relatedPlaylists' in item['contentDetails']:
            playlists = item['contentDetails']['relatedPlaylists']
            playlist_id.extend(list(playlists.values()))
    return playlist_id

playlist_id = get_playlist_id(youtube, channel_id)

playlist_id

['', 'UUweCc7bSMX5J4jEH7HFImng']

# This code retrieves 50 video_ids per request(maxResults=50).We put a restriction
# so that we don't exceed the YouTube API default quota allocation of 10,000 units per day.

# def get_video_ids(youtube, channel_id):
#     video_ids = []
#     next_page_token = None
#     while True:
#         request = youtube.search().list(
#             part="id",
#             channelId=channel_id,
#             type='video',
#             maxResults=50,
#             pageToken=next_page_token
#         )
#         response = request.execute()
#         page_video_count = len(response['items'])
#         print(f"Number of video IDs in this page: {page_video_count}")
#         for item in response['items']:
#             video_ids.append(item['id']['videoId'])
#         next_page_token = response.get('nextPageToken', None)
#         print(f"nextPageToken: {next_page_token}")
#         if not next_page_token:
#             break
#         time.sleep(30)
#     return video_ids


def get_video_ids(youtube, channel_id):
    video_ids = []
    processed_video_ids = set()

    # Try to load the list of processed video IDs from a file
    try:
        with open('processed_video_ids.txt', 'r') as file:
            for line in file:
                processed_video_ids.add(line.strip())
    except FileNotFoundError:
        # If the file doesn't exist, just create an empty set
        pass

    next_page_token = None
    while True:
        request = youtube.search().list(
            part="id",
            channelId=channel_id,
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            video_id = item['id']['videoId']
            if video_id not in processed_video_ids:
                video_ids.append(video_id)
                processed_video_ids.add(video_id)

        next_page_token = response.get('nextPageToken', None)
        if not next_page_token:
            break
        time.sleep(5)

    # Save the list of processed video IDs to a file
    with open('processed_video_ids.txt', 'w') as file:
        for video_id in processed_video_ids:
            file.write(f"{video_id}\n")

    return video_ids

total_video_ids = get_video_ids(youtube, channel_id)

len(total_video_ids)

496

# This code returns a DataFrame with important channel information.

def get_video_details(youtube, total_video_ids):
    all_video_info = []
    for i in range(0, len(total_video_ids), 50):
        request = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        id=','.join(total_video_ids[i:i+50])
        )
        response = request.execute() 

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                            }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)
    
    return pd.DataFrame(all_video_info)

video_df = get_video_details(youtube, total_video_ids)
video_df

# Check for NULL values

video_df.isnull().any()

video_id          False
channelTitle      False
title             False
description       False
tags              False
publishedAt       False
viewCount         False
likeCount         False
favouriteCount     True
commentCount      False
duration          False
definition        False
caption           False
dtype: bool

# Check data types

video_df.dtypes

video_id          object
channelTitle      object
title             object
description       object
tags              object
publishedAt       object
viewCount         object
likeCount         object
favouriteCount    object
commentCount      object
duration          object
definition        object
caption           object
dtype: object

# Convert count columns to numeric

numeric_cols = ['viewCount', 'likeCount', 'favouriteCount', 'commentCount']
video_df[numeric_cols] = video_df[numeric_cols].apply(pd.to_numeric, errors = 'coerce', axis = 1)

# Extract weekday name from publish date

from dateutil import parser
video_df['publishedAt'] = video_df['publishedAt'].apply(lambda x: parser.parse(x)) 
video_df['pushblishDayName'] = video_df['publishedAt'].apply(lambda x: x.strftime("%A"))

# Convert duration to seconds

import isodate
video_df['durationSecs'] = video_df['duration'].apply(lambda x: isodate.parse_duration(x))
video_df['durationSecs'] = video_df['durationSecs'].astype('timedelta64[s]')

# Add tagCount column

video_df['tagCount'] = video_df['tags'].apply(lambda x: 0 if x is None else len(x))

video_df.sort_values('viewCount',ascending=False)

# Best Performing Videos

ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=False)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000000) + 'M'))

# Worst Performing Videos

ax = sns.barplot(x = 'title', y = 'viewCount', data = video_df.sort_values('viewCount', ascending=True)[0:9])
plot = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos:'{:,.0f}'.format(x/1000) + 'K'))

# View Distribution Per Video

sns.violinplot(video_df['channelTitle'], video_df['viewCount'])

C:\Users\cbots\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='channelTitle', ylabel='viewCount'>

# Views vs. Comments And Likes

fig, ax = plt.subplots(1,2)
sns.scatterplot(data = video_df, x = 'commentCount', y = 'viewCount', ax = ax[0])
sns.scatterplot(data = video_df, x = 'likeCount', y = 'viewCount', ax = ax[1])

<AxesSubplot:xlabel='likeCount', ylabel='viewCount'>

# Video Duration

sns.histplot(data = video_df, x = 'durationSecs', bins=30)

<AxesSubplot:xlabel='durationSecs', ylabel='Count'>

# Wordcloud For Video Titles

stop_words = set(stopwords.words('english'))
video_df['title_no_stopwords'] = video_df['title'].apply(lambda x: [item for item in str(x).split() if item not in stop_words])

all_words = list([a for b in video_df['title_no_stopwords'].tolist() for a in b])
all_words_str = ' '.join(all_words)
knight_shape = np.array(Image.open("knight_shape.png"))
def plot_cloud(wordcloud):
    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud) 
    plt.axis("off");

wordcloud = WordCloud(random_state=1, background_color='black', 
                      colormap='viridis', collocations=False,mask=knight_shape,
                      contour_width=0.5,contour_color='yellow').generate(all_words_str)
plot_cloud(wordcloud)
# knight_shape = np.array(Image.open("knight_shape4.png"))
# wc = WordCloud(width=2000, height=1000, random_state=1,colormap='viridis',background_color='white',
#                   stopwords=stop_words,collocations=False,mask=knight_shape,contour_width=1,contour_color='black')
# wc.generate(all_words_str)
# image_colors = ImageColorGenerator(knight_shape)
# wc.recolor(color_func = image_colors)
# plt.imshow(wc, interpolation='bilinear')
# plt.axis('off')
# plt.show()

# Upload Schedule

day_df = pd.DataFrame(video_df['pushblishDayName'].value_counts())
weekdays = [ 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_df = day_df.reindex(weekdays)
ax = day_df.reset_index().plot.bar(x='index', y='pushblishDayName', rot=0)

# List of the 10 most frequent words and their frequency percentages


# Concatenate the title and description columns and change the column type to string.
text = video_df['title'].str.cat(video_df['description'], sep=' ').to_string()

# Tokenize the text and remove stop words
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]

# Get the word frequency count
word_frequency = Counter(tokens)
total_words = sum(word_frequency.values())

# Get the top 10 most frequent words
top_words = word_frequency.most_common(10)

# Plot the frequency of the top 10 words
words = [word[0] for word in top_words]
counts = [word[1] for word in top_words]
percentages = [count/total_words*100 for count in counts]

plt.bar(words, percentages)
plt.xlabel('Words')
plt.ylabel('Frequency Percentage')
plt.title('Top 10 Most Frequent Words')
plt.xticks(rotation=90)
for i, v in enumerate(percentages):
    plt.text(i, v, str(round(v)) + "%", va='bottom', ha='center')
plt.show()

# Scatter plot of views against likes

plt.scatter(video_df["viewCount"], video_df["likeCount"])
plt.xlabel("Views")
plt.ylabel("Likes")
plt.title("Views vs Likes")
plt.show()

# Scatter plot of views against comments

plt.scatter(video_df["viewCount"], video_df["commentCount"])
plt.xlabel("Views")
plt.ylabel("Comments")
plt.title("Views vs Comments")
plt.show()

# Scatter plot of views against duration

plt.scatter(video_df["durationSecs"], video_df["viewCount"])
plt.xlabel("Duration (Seconds)")
plt.ylabel("Views")
plt.title("Scatter Plot Of Views Against Duration")
plt.show()

# Scatter plot of likes against comments

plt.scatter(video_df["likeCount"], video_df["commentCount"])
plt.xlabel("Likes")
plt.ylabel("Comments")
plt.title("Scatter Plot Of Likes Against Comments")
plt.show()

# Create a time series plot of the published dates of videos

# Convert the "publishedAt" column to datetime format
video_df["publishedAt"] = pd.to_datetime(video_df["publishedAt"])

# Set the "publishedAt" column as the index of the dataframe
video_df.set_index("publishedAt", inplace=True)

# Resample the data by month and plot the mean number of views
video_df["viewCount"].resample("M").mean().plot()
plt.xlabel("Published Date")
plt.ylabel("Mean Views")
plt.title("Time Series Of Mean Views By Month")
plt.show()

# 3D scatter plot to determine if there is a relationship between views, likes, and duration.

from mpl_toolkits.mplot3d import Axes3D
# create the figure and 3D axis
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# create the scatter plot
ax.scatter(video_df["viewCount"], video_df["likeCount"], video_df["durationSecs"])

# set axis labels
ax.set_xlabel("Views")
ax.set_ylabel("Likes")
ax.set_zlabel("Duration")
plt.title("3D Scatter Plot Of Views, Likes And Duration")
plt.show()

	video_id	channelTitle	title	description	tags	publishedAt	viewCount	likeCount	favouriteCount	commentCount	duration	definition	caption
0	xfWkP1LsYpY	GMHikaru	Titled Tuesday Part 2, The Race to the Finish	Titled Tuesday Part 2, The Race to the Finish\...	[Hikaru Nakamura, master chess, chess instruct...	2022-01-27T21:15:02Z	61090	1245	None	54	PT50M49S	hd	true
1	dyuxOOAB5TE	GMHikaru	How Not to Mate!	♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-15T22:15:02Z	59036	3048	None	45	PT28S	hd	false
2	PNr9Olps03E	GMHikaru	Even Hikaru Can't Get En Peasant	Hikaru reviews a blunder he nearly made\n\n♟️ ...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-14T10:15:01Z	87625	5179	None	83	PT26S	hd	false
3	S-f1YdZ35pU	GMHikaru	Chess Is About Breaking the Rules \| Botez Gamb...	Hikaru speedruns without a queen and reminds u...	[Hikaru Nakamura, master chess, chess instruct...	2022-01-31T22:45:01Z	104431	2457	None	145	PT20M14S	hd	true
4	LKQ2F_KEKtg	GMHikaru	Andrea Botez Refuses Hikaru	♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-13T22:15:00Z	65773	2866	None	32	PT27S	hd	false
...	...	...	...	...	...	...	...	...	...	...	...	...	...
491	kGfk2f__t8M	GMHikaru	GM Hikaru Nakamura vs RUSSIA \| Speed Chess Sup...	GM Hikaru Nakamura vs RUSSIA \| Speed Chess Sup...	[Hikaru Nakamura, master chess, chess instruct...	2020-10-09T06:45:01Z	1189219	7116	None	409	PT3H14M46S	hd	false
492	cJI6ytWi9L8	GMHikaru	Hikaru's on Fire! Check Out This Mate!!	Check out this wild mate in Titled Tuesday Ear...	[Hikaru Nakamura, master chess, chess instruct...	2022-07-15T07:15:01Z	318437	3615	None	167	PT2H3M23S	hd	true
493	WMnPXSZ-_AY	GMHikaru	Self Mating Ain't Easy	Hikaru solves a deviously difficult self-mate ...	[Hikaru Nakamura, master chess, chess instruct...	2021-06-22T20:15:01Z	83530	2729	None	173	PT20M58S	hd	true
494	CI3ZEVIIGy0	GMHikaru	Polymarket Magnus vs Wesley So Final with Hika...	In the FTX Crypto Finale Magnus Carlsen played...	[hikaru nakamura, hikaru chess commentary, che...	2021-06-05T08:15:00Z	137265	2005	None	128	PT2H57M25S	hd	true
495	TqJikTiDZE8	GMHikaru	Sacrifices Were Made vs Anish Giri	Sacrifices Were Made by Hikaru in Rapid Chess ...	[Hikaru Nakamura, master chess, chess instruct...	2022-05-13T19:30:01Z	86697	1936	None	100	PT51M1S	hd	true

	video_id	channelTitle	title	description	tags	publishedAt	viewCount	likeCount	favouriteCount	commentCount	duration	definition	caption	pushblishDayName	durationSecs	tagCount
311	dJmlDPamgso	GMHikaru	I'm Already Live! \| 4 Player Chess with @Botez...	4 Player Chess with Hikaru, Alexandra Botez, G...	[chess, Hikaru Nakamura, grandmaster, super-gr...	2020-06-13 15:00:12+00:00	5878000.0	114563.0	NaN	2692.0	PT50M58S	hd	true	Saturday	3058.0	30
243	C-DXAaSiGxA	GMHikaru	How To Troll A Super Grandmaster	Hikaru plays more games against Coach Dominic....	[Hikaru Nakamura, chess game play, monroe Stre...	2021-09-19 08:15:00+00:00	5707321.0	112914.0	NaN	3140.0	PT10M31S	hd	true	Sunday	631.0	26
220	mZ5yqDtkQXI	GMHikaru	HikaruBot AI Plays Hikaru's Openings Against Him	Hikaru attempts to beat his shadowy and myster...	[chess, Hikaru Nakamura, grandmaster, super-gr...	2020-07-09 14:45:00+00:00	5692581.0	135817.0	NaN	4522.0	PT15M50S	hd	true	Thursday	950.0	35
254	EYB7NOSY-hI	GMHikaru	Mittens The Chess Bot Will Make You Quit Chess	Hikaru plays against the 1 elo rated Mittens c...	[Hikaru Nakamura, master chess, chess instruct...	2023-01-04 12:00:13+00:00	4418693.0	121437.0	NaN	3885.0	PT21M44S	hd	true	Wednesday	1304.0	16
317	9iB4mp5_ANI	GMHikaru	15 Year Old Causes Carlsen to Lose His Cool	Hikaru watches the Chessbase India video15-yea...	[Hikaru Nakamura, master chess, chess instruct...	2020-12-15 07:45:00+00:00	4402255.0	83450.0	NaN	2809.0	PT15M55S	hd	true	Tuesday	955.0	17
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11	7FhuQcRhzyY	GMHikaru	It Doesn't Fit in the Hole!	♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-14 22:15:00+00:00	33051.0	1788.0	NaN	41.0	PT38S	hd	false	Tuesday	38.0	12
35	mN-guPtzZh0	GMHikaru	What is the Wooden Shield? xqcL	@fionchetta and @JonLudvigHammerChess explain ...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-16 11:18:13+00:00	31949.0	1106.0	NaN	21.0	PT44S	hd	false	Thursday	44.0	15
84	BDOKWgL25SU	GMHikaru	Firouzja in Fashion?	Hikaru reads a @chess article on Firouzja's Ne...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-15 23:15:00+00:00	30308.0	877.0	NaN	108.0	PT10M37S	hd	false	Wednesday	637.0	19
244	jZgGV3BMKVA	GMHikaru	Here or here?	♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-19 22:15:02+00:00	25566.0	1716.0	NaN	37.0	PT16S	hd	false	Sunday	16.0	12
266	99dCcy9FBOE	GMHikaru	Beating 400s With 6 Knights	♟️ LEARN CHESS & PLAY WITH ME ► https://go.ch...	[Hikaru Nakamura, master chess, chess instruct...	2023-02-20 10:15:00+00:00	21894.0	1538.0	NaN	21.0	PT28S	hd	false	Monday	28.0	12