Web Scraping Popular YouTube Tech Channels with Selenium
Data Mining, Data Wrangling, Exploratory Data Analysis
Notebook Created by: David Rusho (GitHub Blog | Tableau | LinkedIn)
About the Data
Web scraping was performed on the top 10 tech channels on YouTube using Selenium, an automated browser (driver) controlled from Python that is commonly used for web scraping and web testing. The channels were selected using a Top 10 Tech Youtubers list from blog.bit.ai.
Data from roughly 2,000 videos was scraped, about 200 of the most popular videos per channel.
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Chrome driver location (for M1 MacBook Air)
DRIVER_PATH = "/opt/homebrew/bin/chromedriver"
# activate driver
driver = webdriver.Chrome(executable_path=DRIVER_PATH)
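Note: Selenium 4.x deprecates the executable_path argument in favor of a Service object; an equivalent setup under that newer API would be the short sketch below (same driver path, not what this notebook ran).
# Selenium 4.x equivalent (executable_path is deprecated there)
from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service(DRIVER_PATH))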
# Scroll to bottom of page
def scroll_page():
    # press END repeatedly so the infinite-scroll grid loads more videos
    for x in range(7):
        html = driver.find_element_by_tag_name("html")
        html.send_keys(Keys.END)
        time.sleep(2)
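The fixed seven scrolls is what loads roughly 200 videos per channel. A more adaptive variant (a sketch, not what this notebook used) keeps scrolling until the page height stops growing:
# alternative: scroll until the page height stops changing
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    html = driver.find_element_by_tag_name("html")
    html.send_keys(Keys.END)
    time.sleep(2)
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height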
def scrap_videos():
    scroll_page()
    chan_xpath = '//*[@id="channel-name"]'
    subs_xpath = '//*[@id="subscriber-count"]'
    videos_class = "style-scope ytd-grid-video-renderer"
    views_xpath = './/*[@id="metadata-line"]/span[1]'
    post_date_xpath = './/*[@id="metadata-line"]/span[2]'
    title_xpath = './/*[@id="video-title"]'
    # Scrape channel name
    try:
        channel_name = driver.find_element_by_xpath(chan_xpath).text
    except Exception:
        pass
    # Scrape number of subscribers
    try:
        subscribers = driver.find_element_by_xpath(subs_xpath).text
    except Exception:
        pass
    # Reassign variable to recalculate all videos
    videos = driver.find_elements_by_class_name(videos_class)
    # Loop through all videos
    for video in videos:
        # grab title if available
        try:
            title = video.find_element_by_xpath(title_xpath).text
        except Exception:
            pass
        # grab url if available
        try:
            url = video.find_element_by_xpath(title_xpath).get_attribute("href")
        except Exception:
            pass
        # grab views if available
        try:
            views = video.find_element_by_xpath(views_xpath).text
        except Exception:
            pass
        # grab post date if available
        try:
            post_date = video.find_element_by_xpath(post_date_xpath).text
        except Exception:
            pass
        video_items = {
            "channel_name": channel_name,
            "subscribers": subscribers,
            "title": title,
            "views": views,
            "post_date": post_date,
            "url": url,
        }
        vid_list.append(video_items)
    return vid_list
# Scrape channel About section
def scrap_about():
    chan_name_xp = '//*[@id="channel-name"]'
    chan_join = './/*[@id="right-column"]/yt-formatted-string[2]/span[2]'
    chan_views = './/*[@id="right-column"]/yt-formatted-string[3]'
    chan_desc = './/*[@id="description"]'
    # Scrape channel name
    try:
        channel_name = driver.find_element_by_xpath(chan_name_xp).text
    except Exception:
        pass
    # Scrape channel join date (About)
    try:
        channel_join = driver.find_element_by_xpath(chan_join).text
    except Exception:
        pass
    # Scrape channel views (About)
    try:
        channel_views = driver.find_element_by_xpath(chan_views).text
    except Exception:
        pass
    # Scrape channel description (About)
    try:
        channel_description = driver.find_element_by_xpath(chan_desc).text
    except Exception:
        pass
    about_items = {
        "channel_name": channel_name,
        "channel_join_date": channel_join,
        "channel_views": channel_views,
        "channel_description": channel_description,
    }
    vid_list.append(about_items)
    return vid_list
# top youtubers based off 'https://blog.bit.ai'
top_youtubers = [
    "ijustine",
    "AndroidAuthority",
    "Mrwhosetheboss",
    "TechnoBuffalo",
    "TLD",
    "austinevans",
    "unboxtherapy",
    "LinusTechTips",
    "UrAvgConsumer",
    "mkbhd",
]
# empty list to hold video details
vid_list = []
# loop over each channel's Videos page, sorted by most popular
for youtuber in top_youtubers:
    print(f"processing {youtuber}")
    url = f"https://www.youtube.com/{youtuber}/videos?view=0&sort=p&flow=grid"
    driver.get(url)
    scroll_page()
    vid_list = scrap_videos()
    about_url = f"https://www.youtube.com/{youtuber}/about"
    driver.get(about_url)
    driver.implicitly_wait(10)
    about_items = scrap_about()
# Close Chrome browser
driver.quit()
# create pandas df for video info
df_channel = pd.DataFrame(vid_list)
# export df to csv
df_channel.to_csv("yt_channel_scrap.csv")
import pandas as pd
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# driver options (size and headless)
options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920x1080")
# Chrome driver location (for M1 MacBook Air)
DRIVER_PATH = "/opt/homebrew/bin/chromedriver"
# activate driver
driver = webdriver.Chrome(executable_path=DRIVER_PATH, options=options)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# partial video description
def par_description():
    vid_desc = "//div[@class='watch-main-col']/meta[@itemprop='description']"
    elems = driver.find_elements_by_xpath(vid_desc)
    for elem in elems:
        return elem.get_attribute("content")
# publish date
def publish():
    pub_date = "//div[@class='watch-main-col']/meta[@itemprop='datePublished']"
    elems = driver.find_elements_by_xpath(pub_date)
    for elem in elems:
        return elem.get_attribute("content")
# upload date
def upload():
    upload_date = "//div[@class='watch-main-col']/meta[@itemprop='uploadDate']"
    elems = driver.find_elements_by_xpath(upload_date)
    for elem in elems:
        return elem.get_attribute("content")
# genre
def genre():
    genre = "//div[@class='watch-main-col']/meta[@itemprop='genre']"
    elems = driver.find_elements_by_xpath(genre)
    for elem in elems:
        return elem.get_attribute("content")
# video width
def width():
    v_width = "//div[@class='watch-main-col']/meta[@itemprop='width']"
    elems = driver.find_elements_by_xpath(v_width)
    for elem in elems:
        return elem.get_attribute("content")
# video height
def height():
    v_height = "//div[@class='watch-main-col']/meta[@itemprop='height']"
    elems = driver.find_elements_by_xpath(v_height)
    for elem in elems:
        return elem.get_attribute("content")
# interaction count
def interactions():
    interactions = "//div[@class='watch-main-col']/meta[@itemprop='interactionCount']"
    elems = driver.find_elements_by_xpath(interactions)
    for elem in elems:
        return elem.get_attribute("content")
# video title
def video_title():
    video_title = "//div[@class='watch-main-col']/meta[@itemprop='name']"
    elems = driver.find_elements_by_xpath(video_title)
    for elem in elems:
        return elem.get_attribute("content")
# channel name
def channel_name():
    channel_name = (
        "//div[@class='watch-main-col']/span[@itemprop='author']/link[@itemprop='name']"
    )
    elems = driver.find_elements_by_xpath(channel_name)
    for elem in elems:
        return elem.get_attribute("content")
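The download loop further down also calls a duration() helper that never appears in the notebook; a minimal sketch in the same style, assuming the watch page exposes a meta[@itemprop='duration'] tag alongside the others:
# video duration (assumed meta tag, mirroring the helpers above)
def duration():
    v_duration = "//div[@class='watch-main-col']/meta[@itemprop='duration']"
    elems = driver.find_elements_by_xpath(v_duration)
    for elem in elems:
        return elem.get_attribute("content")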
# Number of likes
def likes():
    likes_xpath = "(//div[@id='top-level-buttons-computed']//*[contains(@aria-label,' likes')])[last()]"
    return driver.find_element_by_xpath(likes_xpath).text
# Total Comments
def comments():
    # scrolling is needed before the comment section renders
    SCROLL_PAUSE_TIME = 0.5
    # scroll down one screen
    driver.execute_script("window.scrollTo(0, 1080)")
    # wait for page to load
    time.sleep(SCROLL_PAUSE_TIME)
    # scroll again to trigger comment loading
    driver.execute_script("window.scrollTo(300, 1080)")
    time.sleep(SCROLL_PAUSE_TIME)
    # wait until the comment count element is present
    com = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="count"]/yt-formatted-string')
        )
    )
    return com.text
# import csv of youtube channels data
df_channels = pd.read_csv("yt_channel_scrap.csv")
# new df of channel names and urls
df_videos = df_channels[["channel_name", "url"]].dropna()
# isolate video urls to a list
url_list = df_videos.url.to_list()
vid_list = []
url_fails_ls = []
count = 0
start_time = datetime.now()
# loop over every video url
for url in url_list:
    driver.get(url)
    count += 1
    time.sleep(3)
    # wait until the subscribe button confirms the page has rendered
    subscribe_button = '//*[@id="subscribe-button"]'
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, subscribe_button))
    )
    try:
        comments_num = comments()
        likes_num = likes()
        chan_name = channel_name()
        v_duration = duration()
        p_description = par_description()
        publish_date = publish()
        upload_date = upload()
        v_genre = genre()
        v_width = width()
        v_height = height()
        title = video_title()
        interaction_count = interactions()
    except Exception:
        print(f"EXCEPTION RAISED for {url}")
        url_fails_ls.append(url)
        continue  # skip to the next url so partial data isn't appended
    video_items = {
        "url": url,  # primary key
        "Channel Name": chan_name,
        "Title": title,
        "Duration": v_duration,
        "Partial Description": p_description,
        "Publish Date": publish_date,
        "Upload_date": upload_date,
        "Genre": v_genre,
        "Width": v_width,
        "Height": v_height,
        "Likes": likes_num,
        "Comments": comments_num,
        "Interaction Count": interaction_count,
    }
    vid_list.append(video_items)
    # print every 10th url
    if count % 10 == 0:
        print(f"URL {count} of {len(url_list)} processed.")
driver.quit()
end_time = datetime.now()
# create dfs for video and failed urls
df_videos = pd.DataFrame(vid_list)
# store urls that failed to load in driver
url_fails_dict = {"url": url_fails_ls}
df_url_fails = pd.DataFrame(url_fails_dict)
print("Driver Quit")
print("Code Duration: {}".format(end_time - start_time))
print(f"Videos Processed: {len(vid_list)}")
print(f"Failures: {len(url_fails_ls)}")
# export dfs to csv
df_url_fails.to_csv("url_fails.csv")
df_videos.to_csv("yt_videos_scrap.csv")
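Most failures are just slow page loads, so the urls captured in url_fails.csv can be pushed back through the same loop on a second pass; a minimal sketch, assuming the driver has been restarted and the helper functions above are still defined:
# second pass over failed urls (sketch)
retry_urls = pd.read_csv("url_fails.csv")["url"].to_list()
for url in retry_urls:
    driver.get(url)
    time.sleep(10)  # give slow pages extra time
    # ...then repeat the same try/except scraping block as above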
Importing and Cleaning the Data
Note: The code in the cell below comes from this notebook, which I originally created to clean and merge the data.
import pandas as pd
# load channel csv
yt = pd.read_csv("yt_channel_scrap.csv", parse_dates=["channel_join_date"])
# create df of channel details (About rows are the ones with a join date)
channel_details = yt[yt.channel_join_date.notna()]
channel_details = channel_details.drop(
    columns=["Unnamed: 0", "subscribers", "title", "views", "post_date"]
).reset_index(drop=True)
# create df of video details (video rows have no join date)
video_details = yt[yt.channel_join_date.isna()]
video_details = video_details.drop(
    columns=[
        "Unnamed: 0",
        "channel_join_date",
        "channel_views",
        "channel_description",
        "post_date",
    ]
).reset_index(drop=True)
# merge dfs
merged = channel_details.merge(video_details, on="channel_name")
# drop 2nd url column and rename remaining url col
merged.drop(columns=["url_x"], inplace=True)
merged.rename(columns={"url_y": "url"}, inplace=True)
# convert subscribers dtype to float (values are reported in millions)
merged.subscribers = (
    merged.subscribers.str.replace("M subscribers", "").astype("float") * 1000000
)
# modify views col dtype to float
def fix_views(col):
    if "M" in col:
        return float(col.replace("M views", "")) * 1000000
    elif "K" in col:
        return float(col.replace("K views", "")) * 1000
    elif "1 year ago" in col:
        # a few rows captured the post date instead of a view count; zero them
        return 0
merged["views"] = merged["views"].apply(fix_views)
# correct channel views column to display numbers only
merged["channel_views"] = (
    merged["channel_views"].str.replace(",", "").str.replace(" views", "").astype("int")
)
# import videos csv data
df_videos = pd.read_csv(
    "yt_videos_scrap_big_data.csv", parse_dates=["Publish Date", "Upload_date"]
)
df_videos.drop(
    columns=["Unnamed: 0", "Duration", "Channel Name", "Title"], inplace=True
)
# comments dtype to int
df_videos["Comments"] = (
    df_videos["Comments"].str.replace("Comments", "").str.replace(",", "").astype("int")
)
# modify likes col dtype to float
def fix_likes(col):
    if "M" in col:
        return float(col.replace("M", "")) * 1000000
    elif "K" in col:
        return float(col.replace("K", "")) * 1000
    else:
        return float(col)
# Fix Likes Column
df_videos["Likes"] = df_videos["Likes"].apply(fix_likes)
# Fix Width and Height, remove '.' and '0' from end of str
df_videos["Width"] = df_videos["Width"].astype("str").str.split(".", expand=True)[0]
df_videos["Height"] = df_videos["Height"].astype("str").str.split(".", expand=True)[0]
vc_merged = merged.merge(df_videos, on="url")
# rename columns to increase readability in analysis plots and tables
vc_merged.rename(
    columns={
        "channel_name": "Channel Name",
        "channel_join_date": "Channel Join Date",
        "channel_views": "Channel Views (M)",
        "subscribers": "Subscribers (M)",
        "Interaction Count": "Interactions (M)",
        "views": "Video Views (M)",
        "Partial Description": "Video Desc",
        "Publish Date": "Publish Date",
        "Upload_date": "Upload Date",
        "Genre": "Video Genre",
        "Width": "Width",
        "Height": "Height",
        "Comments": "Video Comments",
        "title": "Video Title",
        "url": "Video URL",
    },
    inplace=True,
)
# the analysis cells below use df as the working dataframe
df = vc_merged
# List of Video Channels
yt_chan_jn = (
    df.groupby(["Channel Join Date", "Channel Name", "Channel Views (M)"])[
        "Subscribers (M)"
    ]
    .max()
    .to_frame()
    .reset_index()
)
# rename columns to increase readability
yt_chan_jn.rename(
    columns={
        "Channel Name": "Channel",
        "Channel Join Date": "Join Date",
        "Subscribers (M)": "Subscribers",
        "Channel Views (M)": "Channel Views",
    },
    inplace=True,
)
yt_chan_jn
# style dataframe to highlight highest values
yt_chan_jn = (
    yt_chan_jn.style.format(
        formatter={"Subscribers": "{:,} M", "Channel Views": "{:,} M"}
    )
    .background_gradient(subset=["Channel Views", "Subscribers"], cmap="Wistia")
    .set_caption("Youtube Channels Ordered by Join Date")
    .set_table_styles(
        [dict(selector="caption", props=[("text-align", "center"), ("font-size", "125%")])]
    )
    .hide_index()
)
yt_chan_jn
# Top 10 Videos by Views
top_vwd_chan = (
    df.groupby(["Video Title", "Channel Name", "Publish Date"])["Video Views (M)"]
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
# rename columns to increase readability
top_vwd_chan.rename(
    columns={"Channel Name": "Channel", "Video Views (M)": "Video Views"}, inplace=True
)
top_vwd_chan.style.format(
    formatter={"Video Views": "{:,} M", "Publish Date": "{:%Y-%m-%d}"}
).background_gradient(
    subset=["Video Views", "Publish Date"], cmap="Wistia"
).set_caption(
    "Top 10 Youtube Videos by Views"
).set_table_styles(
    [dict(selector="caption", props=[("text-align", "center"), ("font-size", "125%")])]
).hide_index()
# Total Views by Channel
chan_views = (
    df.groupby(["Channel Name", "Subscribers (M)"])["Video Views (M)"]
    .sum()
    .sort_values(ascending=False)
    .reset_index()
)
# rename columns to increase readability
chan_views.rename(
    columns={
        "Channel Name": "Channel",
        "Video Views (M)": "Video Views",
        "Subscribers (M)": "Subscribers",
    },
    inplace=True,
)
chan_views.style.format(
    formatter={
        "Video Views": "{0:,.0f} M",
        "Subscribers": "{:,} M",
    }
).background_gradient(subset=["Video Views", "Subscribers"], cmap="Wistia").set_caption(
    "Channels Grouped by Video Views"
).set_table_styles(
    [dict(selector="caption", props=[("text-align", "center"), ("font-size", "125%")])]
).hide_index()
Top 10 Liked Videos
Note 1: Two of the following top 10 liked videos don't review a tech product:
- "Reflecting on the Color of My Skin" created by Marques Brownlee
- "I've been thinking of retiring" created by Linus Tech Tips
Note 2: Mrwhosetheboss capitalizes "THIS" in a lot of their video titles.
# Top 10 Videos by Likes
top_lkd_chan = (
    df.groupby(["Video Title", "Channel Name", "Publish Date"])["Likes"]
    .max()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)
# rename columns to increase readability
top_lkd_chan.rename(columns={"Channel Name": "Channel"}, inplace=True)
top_lkd_chan.style.format(
    formatter={"Likes": "{:,}", "Publish Date": "{:%Y-%m-%d}"}
).background_gradient(subset=["Likes", "Publish Date"], cmap="Wistia").set_caption(
    "Top 10 Liked Videos"
).set_table_styles(
    [dict(selector="caption", props=[("text-align", "center"), ("font-size", "125%")])]
).hide_index()
# Top Video Likes Over Time (Scatter Plot)
import plotly.express as px
import plotly.graph_objects as go
# set global plot colors
# plotly marker colors
mcolors = "#1f77b4" # light blue
# wordcloud letters
cmaps = "Wistia"
cmaps_r = "Wistia_r"
# plotly background
wtbckgnd = {"plot_bgcolor": "rgba(255,255,255, 0.9)"}  # white background
blkbackground = {"plot_bgcolor": "rgba(0, 0, 0, 0.5)"}  # black background
fig = px.scatter(
    df,
    y="Likes",
    x="Publish Date",
    color="Likes",
    hover_name="Video Title",
    hover_data=["Channel Name"],
    color_continuous_scale="solar_r",
)
fig.update_layout(
    wtbckgnd,  # set background to white
    title={
        "text": "Top Video Likes Over Time",
        "y": 0.88,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    xaxis_title="Video Publish Date",
    yaxis_title="No. of Likes",
)
fig.show()
import texthero as hero
from texthero import preprocessing
from texthero import stopwords
# create a custom cleaning pipeline
custom_pipeline = [
    preprocessing.fillna,
    # preprocessing.lowercase,
    preprocessing.remove_digits,
    preprocessing.remove_punctuation,
    preprocessing.remove_diacritics,
    # preprocessing.remove_stopwords,
    preprocessing.remove_whitespace,
    # preprocessing.stem,
]
default_stopwords = stopwords.DEFAULT
# add custom stopwords to the default set
custom_stopwords = default_stopwords.union(set(["The", "vs"]))
# pass the custom_pipeline to the pipeline argument
df["clean_title"] = hero.clean(df["Video Title"], pipeline=custom_pipeline)
# call remove_stopwords and pass the custom_stopwords list
df["clean_title"] = hero.remove_stopwords(df["clean_title"], custom_stopwords)
tw = hero.visualization.top_words(df["clean_title"]).head(10).to_frame()
tw.reset_index(inplace=True)
tw.rename(columns={"index": "word", "clean_title": "freq"}, inplace=True)
fig = go.Figure([go.Bar(x=tw.word, y=tw.freq, textposition="auto")])
fig.update_layout(
    wtbckgnd,  # set background to white
    title={
        "text": "Word Frequency for 2,000 Video Titles",
        "y": 0.88,
        "x": 0.5,
        "xanchor": "center",
        "yanchor": "top",
    },
    yaxis=dict(title="Word Count"),
)
fig.update_traces(marker_color="orange")
# Word cloud of top words from clean_title
hero.wordcloud(
    df.clean_title,
    max_words=200,
    # contour_color='red',
    background_color="white",
    colormap="Oranges",
    height=500,
    width=800,
)
# Add pca value to dataframe to use as visualization coordinates
df["pca"] = df["clean_title"].pipe(hero.tfidf).pipe(hero.pca)
# Add k-means cluster to dataframe
df["kmeans"] = df["clean_title"].pipe(hero.tfidf).pipe(hero.kmeans)
hero.scatterplot(df, "pca", color="kmeans", hover_data=["Video Title"])
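A side note on the cluster count: as far as I know, texthero's kmeans defaults to five clusters via its n_clusters parameter, which is why five topic groups show up in the takeaways below. A different granularity could be explored with something like:
# assumption: hero.kmeans accepts n_clusters (default 5 in texthero)
df["kmeans_8"] = df["clean_title"].pipe(hero.tfidf).pipe(hero.kmeans, n_clusters=8)
hero.scatterplot(df, "pca", color="kmeans_8", hover_data=["Video Title"])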
# correlation matrix of the numeric columns
# (errors="ignore" in case the index column isn't present)
df.drop(columns=["Unnamed: 0", "Width", "Height"], errors="ignore").corr().style.background_gradient(
    subset=[
        "Channel Views (M)",
        "Subscribers (M)",
        "Video Views (M)",
        "Likes",
        "Video Comments",
        "Interactions (M)",
    ],
    cmap="Wistia",
)
Takeaways
Video comment counts show very little correlation with any of the other data gathered in this project.
The following pairs appear to be highly correlated:
- Channel Views and Subscribers
- Interactions and Video Views
Video titles fall into 5 topic groups:
- iPhone (kmeans 0)
- Samsung (kmeans 1)
- Reviews (kmeans 2)
- Unboxing (kmeans 3)
- How-to (kmeans 4)
70% of the most viewed videos are about phones.
The Join Date (the date a YouTube channel was created) does not appear to have any relationship to the number of subscribers or overall channel views.