In [1]:
import csv
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

This script extracts the 100 best-selling books from amazon.co.uk (title, author, price)¶

Attention! As of 19/2/2023 this script works like a charm. There is no guarantee that it will work tomorrow. Websites change constantly, and even a small change in the structure of the website can break the script. The good news is that many changes to websites are small and incremental, so we’ll likely be able to update our scraper with only minimal adjustments.¶

In [3]:
##### Web scraper that imitates page scrolling for lazy loading pages #####
# Location of the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used.
CHROMEDRIVER_PATH = os.environ.get(
    "CHROMEDRIVER_PATH",
    r"C:\Users\cbots\Downloads\MY PORTFOLIO\Amazon Web Scraper Projects\Scraping 100 Best Sellers\chromedriver.exe",
)
# Page the browser opens first
BESTSELLERS_URL = "https://www.amazon.co.uk/gp/bestsellers/books/ref=pd_zg_ts_books"

service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)
driver.get(BESTSELLERS_URL)
time.sleep(2)  # Allow 2 seconds for the web page to open

####  FIRST PAGE  ####
# Imitate mouse scrolling: move down the first page one screen height at a time
scroll_pause_time = 1  # seconds to wait after each scroll so the page can load properly
screen_height = driver.execute_script("return window.screen.height;")  # screen height reported by the browser
i = 1  # index of the next screen-height step to scroll to
reached_bottom = False
while not reached_bottom:
    # jump to the i-th multiple of the screen height
    driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
    i += 1
    time.sleep(scroll_pause_time)
    # re-read the total scroll height on every pass, as it can change after scrolling
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # stop once the next scroll target is larger than the total scroll height
    reached_bottom = screen_height * i > scroll_height

# Scrape the data from the first page
soup = BeautifulSoup(driver.page_source, "html.parser")

# Authors: elements whose class attribute matches either of the two listed values
authors1 = soup.find_all(class_=["a-size-small a-link-child", "a-size-small a-color-base"])
page1_authors = [author.get_text(strip=True) for author in authors1]

# Prices: elements whose class attribute matches either of the two listed values
prices1 = soup.find_all(class_=["_cDEzb_p13n-sc-price_3mJ9Z", "p13n-sc-price"])
page1_prices = [price.get_text(strip=True) for price in prices1]

# Titles: text of the line-clamp divs found inside each 'a-link-normal' link
aTags = soup.find_all('a', {'class': 'a-link-normal'})
page1_titles = []
for tag in aTags:
    # Search the link itself for both title classes. The inner loop variables have
    # their own names so that `tag` keeps referring to the link for both searches.
    for title_class in ('_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y', '_cDEzb_p13n-sc-css-line-clamp-2_EWgCb'):
        for title_div in tag.find_all('div', {'class': title_class}):
            page1_titles.append(title_div.get_text(strip=True))

# Combine book titles, authors and prices
# Rows are paired by position, so the file is only written when all three lists have the same length.
if len(page1_titles) == len(page1_authors) == len(page1_prices):
    with open('titles_authors_prices.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for title, author, raw_price in zip(page1_titles, page1_authors, page1_prices):
            # drop the first pound sign and surrounding whitespace from the price text
            price = raw_price.replace('£', '', 1).strip()
            # the title is written wrapped in single quotes
            writer.writerow([f"'{title}'", author, price])
else:
    # report all three counts: the check above compares titles, authors and prices
    print(
        'Error: page 1 item counts do not match - '
        f'titles: {len(page1_titles)}, authors: {len(page1_authors)}, prices: {len(page1_prices)}'
    )

# Pagination - check if there is a Next Page button on the first page
next_button = driver.find_elements(By.CLASS_NAME, "a-last")
if next_button:
    # If there is a next page button, click it
    next_button[0].click()
    time.sleep(3)  # Wait for the second page to load
else:
    # Make the failure visible instead of continuing silently on the same page
    print('Warning: no Next Page button found; the browser is still on the first page')

####  SECOND PAGE  ####
# Imitate mouse scrolling: move down the second page one screen height at a time
scroll_pause_time = 1  # seconds to wait after each scroll so the page can load properly
screen_height = driver.execute_script("return window.screen.height;")  # screen height reported by the browser
i = 1  # index of the next screen-height step to scroll to

reached_bottom = False
while not reached_bottom:
    # jump to the i-th multiple of the screen height
    driver.execute_script(f"window.scrollTo(0, {screen_height}*{i});")
    i += 1
    time.sleep(scroll_pause_time)
    # re-read the total scroll height on every pass, as it can change after scrolling
    scroll_height = driver.execute_script("return document.body.scrollHeight;")
    # stop once the next scroll target is larger than the total scroll height
    reached_bottom = screen_height * i > scroll_height

# Scrape the data from the second page
soup = BeautifulSoup(driver.page_source, "html.parser")

# Authors: elements whose class attribute matches either of the two listed values
authors2 = soup.find_all(class_=["a-size-small a-link-child", "a-size-small a-color-base"])
page2_authors = [author.get_text(strip=True) for author in authors2]

# Prices: elements whose class attribute matches either of the two listed values
prices2 = soup.find_all(class_=["_cDEzb_p13n-sc-price_3mJ9Z", "p13n-sc-price"])
page2_prices = [price.get_text(strip=True) for price in prices2]

# Titles: text of the line-clamp divs found inside each 'a-link-normal' link
aTags = soup.find_all('a', {'class': 'a-link-normal'})
page2_titles = []
for tag in aTags:
    # Search the link itself for both title classes. The inner loop variables have
    # their own names so that `tag` keeps referring to the link for both searches.
    for title_class in ('_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y', '_cDEzb_p13n-sc-css-line-clamp-2_EWgCb'):
        for title_div in tag.find_all('div', {'class': title_class}):
            page2_titles.append(title_div.get_text(strip=True))

# Combine book titles, authors and prices
# Rows are paired by position, so the file is only written when all three lists have the same length.
if len(page2_titles) == len(page2_authors) == len(page2_prices):
    with open('titles_authors_prices2.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for title, author, raw_price in zip(page2_titles, page2_authors, page2_prices):
            # drop the first pound sign and surrounding whitespace from the price text
            price = raw_price.replace('£', '', 1).strip()
            # the title is written wrapped in single quotes
            writer.writerow([f"'{title}'", author, price])
else:
    # report all three counts: the check above compares titles, authors and prices
    print(
        'Error: page 2 item counts do not match - '
        f'titles: {len(page2_titles)}, authors: {len(page2_authors)}, prices: {len(page2_prices)}'
    )


try:
    # read the data from the first file (it has no header row, so column names are supplied here)
    df1 = pd.read_csv('titles_authors_prices.csv', delimiter=',', header=None, names=['Book Title', 'Author', 'Price'])

    # read the data from the second file
    df2 = pd.read_csv('titles_authors_prices2.csv', delimiter=',', header=None, names=['Book Title', 'Author', 'Price'])

    # combine the dataframes into one, renumbering the rows from 0
    df = pd.concat([df1, df2], ignore_index=True)

    # write the combined data to a CSV file
    df.to_csv('best100_titles_authors_prices.csv', index=False)
finally:
    # close the browser even if reading or writing a CSV file raises
    driver.quit()
In [4]:
pd.options.display.max_rows = None  # no row limit: display all rows
In [5]:
pd.set_option('display.max_colwidth', 200)  # widen columns so the whole Book Title fits
In [6]:
# Show the combined DataFrame as the cell output
df
Out[6]:
Book Title Author Price
0 'Bored of Lunch: The Healthy Air Fryer Book' Nathan Anthony 9.49
1 'Bored of Lunch: The Healthy Slow Cooker Book: THE NUMBER ONE BESTSELLER' Nathan Anthony 9.00
2 'Strong Female Character' Fern Brady 15.63
3 'The World According to Kaleb: THE SUNDAY TIMES BESTSELLER - worldly wisdom from the breakout star of Clarkson’s Farm' Kaleb Cooper 9.00
4 'It Ends With Us: The emotional #1 Sunday Times bestseller (Lily & Atlas, 1)' Colleen Hoover 4.50
5 'Atomic Habits: the life-changing million-copy #1 bestseller' James Clear 9.00
6 'The Ultimate Air Fryer Cookbook: Quick, healthy, energy-saving recipes using UK measurements' Clare Andrews 10.00
7 'Thrown: THE SUNDAY TIMES BESTSELLING novel of friendship, heartbreak and pottery for beginners' Sara Cox 6.74
8 'Dr Rupy Cooks: Over 100 easy, healthy, flavourful recipes' Dr Rupy Aujla 11.00
9 'The Keeper of Stories: The most charming and uplifting novel you will read this year!' Sally Page 3.00
10 'Freezing Order: Vladimir Putin, Russian Money Laundering and Murder - A True Story' Bill Browder 7.99
11 'Why Has Nobody Told Me This Before?: The No 1 Sunday Times Bestseller 2022' Dr Julie Smith 8.00
12 'Spare: by Prince Harry, The Duke of Sussex' Prince Harry The Duke of Sussex 14.00
13 'Menopausing: The Sunday Times bestselling self-help guide for 2022 to help you care for yourself, cope with symptoms, and live your best life during menopause' Davina McCall 11.00
14 'The Boy, The Mole, The Fox and The Horse' Charlie Mackesy 8.49
15 'Roald Dahl Collection 16 Books Box Set: Pakiet' Roald Dahl 34.45
16 'It Starts with Us: the highly anticipated sequel to IT ENDS WITH US' Colleen Hoover 7.00
17 'Pinch of Nom: Enjoy' Kay Allinson 10.00
18 'Verity: The thriller that will capture your heart and blow your mind' Colleen Hoover 4.50
19 'Stone Maidens' Lloyd Devereux Richards 8.99
20 'The Fast 800 Keto Recipe Book: Delicious low-carb recipes, for rapid weight loss and long-term health: The Sunday Times Bestseller (The Fast 800 Series)' Dr Clare Bailey 9.49
21 'The official highway code' Driver & Vehicle Standards Agency 4.99
22 'Follow the Money: How much does Britain cost?' Paul Johnson 19.29
23 'Lessons in Chemistry: The No. 1 Sunday Times bestseller and BBC Between the Covers Book Club pick' Bonnie Garmus 7.19
24 'What’s For Dinner?: The Sunday Times Bestseller – Fuss-free family food in 30 minutes, the first cookbook from the Taming Twins food blog' Sarah Rossi 10.00
25 'Fitwaffle's Baked In One: 100 one-tin cakes, bakes and desserts from the social media sensation' Eloise Head 11.00
26 '8 Rules of Love: From Sunday Times No.1 bestselling author Jay Shetty, a new guide on how to find lasting love and enjoy healthy relationships' Jay Shetty 10.00
27 'Where's Spidey?: A Marvel Spider-Man search & find book' Marvel Entertainment International Ltd 3.00
28 'Seven Husbands of Evelyn Hugo: The Sunday Times Bestseller' Taylor Jenkins Reid 4.50
29 'Food for Life: The New Science of Eating Well, by the #1 bestselling author of SPOON-FED' Tim Spector 10.00
30 'A Million Sloths: Super-Cute Creatures to Colour: 1 (A Million Creatures to Colour)' Lulu Mayo 4.95
31 'The Thursday Murder Club: (The Thursday Murder Club 1)' Richard Osman 1.96
32 'What I Wish People Knew About Dementia: From Someone Who Knows' Wendy Mitchell 7.99
33 'Daisy Jones and The Six: From the Sunday Times bestselling author of CARRIE SOTO IS BACK' Taylor Jenkins Reid 5.00
34 'Matilda: Special Edition' Roald Dahl 4.00
35 'Greg the Sausage Roll: Wish You Were Here: A LadBaby Book' Mark Hoyle 11.99
36 'Someone Else’s Shoes: The No 1 Sunday Times bestseller from the author of Me Before You and The Giver of Stars' Jojo Moyes 11.00
37 'Colonialism: A Moral Reckoning' Nigel Biggar 19.29
38 'You Choose Your Adventure: A World Book Day 2023 Mini Book' Pippa Goodhart 0.88
39 'Slimming Eats Made Simple: Delicious and easy recipes – 100+ under 500 calories: 2' Siobhan Wightman 10.00
40 'Lessons in Chemistry: The No. 1 Sunday Times bestseller and BBC Between the Covers Book Club pick' Bonnie Garmus 8.49
41 'Young Forever: The Secrets to Living Your Longest, Healthiest Life' Mark Hyman 14.19
42 'The Man Who Died Twice: (The Thursday Murder Club 2)' Richard Osman 4.50
43 'The Salt Path: The 85-Week Sunday Times Bestseller from the Million-Copy Bestselling Author (Raynor Winn, 1)' Raynor Winn 9.99
44 'Rich Dad Poor Dad: What the Rich Teach Their Kids About Money That the Poor and Middle Class Do Not!' Robert T. Kiyosaki 7.99
45 'How To Kill Your Family: THE #1 SUNDAY TIMES BESTSELLER' Bella Mackie 4.50
46 'The Bullet That Missed: (The Thursday Murder Club 3)' Richard Osman 10.00
47 'Time to Think: The Inside Story of the Collapse of the Tavistock’s Gender Service for Children' Hannah Barnes 16.30
48 'Me vs Brain: An Overthinker’s Guide to Life' Hayley Morris 14.99
49 'Fleishman Is in Trouble: Soon to be a major TV series starring Claire Danes & Jesse Eisenberg' Taffy Brodesser-Akner 6.99
50 'My Husband's Killer: The emotional, twisty new mystery from the #1 bestselling author of Friend Request' Laura Marshall 7.32
51 'Kay's Brilliant Brains: A World Book Day 2023 Mini Book' Adam Kay 1.00
52 'Never Never: TikTok made me buy it! The romantic thriller from BookTok sensation and Sunday Times bestselling author of It Ends with Us and New York Times bestselling author of The Wives' Colleen Hoover 4.49
53 'Minecraft Annual 2023: The best new official children’s gaming annual of 2022 – perfect for kids into video games!' Mojang AB 1.69
54 'The Body Keeps the Score: Mind, Brain and Body in the Transformation of Trauma' Bessel van der Kolk 11.95
55 'Ugly Love: a novel' Colleen Hoover 4.50
56 'Diddly Squat: ‘Til The Cows Come Home: The No 1 Sunday Times Bestseller 2022' Jeremy Clarkson 10.00
57 'Roald Dahl 15 Book Box Set Collection' Roald Dahl 35.75
58 'The Paper Palace: The No.1 New York Times Bestseller and Reese Witherspoon Bookclub Pick' Miranda Cowley Heller 5.00
59 'A Good Girl's Guide to Murder: TikTok made me buy it! The first book in the bestselling thriller trilogy, as seen in Netflix’s Heartstopper! (A Good Girl’s Guide to Murder, Book 1)' Holly Jackson 4.00
60 'Mr Men Little Miss Pancake Day: The perfect illustrated children’s book to celebrate Pancake Day! (Mr. Men and Little Miss Picture Books)' Adam Hargreaves 4.50
61 'A Million Ways to Stay on the Run: The uncut story of the international manhunt for public enemy no.1 Kenny Noye' Donal MacIntyre 6.99
62 'My Husband's Killer: The emotional, twisty new mystery from the #1 bestselling author of Friend Request' Laura Marshall 14.99
63 'The Very Hungry Caterpillar [Board Book]: Eric Carle' Eric Carle 5.03
64 'The Midnight Library: The No.1 Sunday Times bestseller and worldwide phenomenon' Matt Haig 4.50
65 'Diabetic Air Fryer Cookbook for Beginners: 1900 Days Quick, Crispy and Healthy Diabetic Friendly Recipes to Take Care of Your Well-Being without Sacrificing Taste | 28-Day Meal Plan for Balanced ... Amanda Ray 3.25
66 'Girl in Pieces: TikTok made me buy it!' Kathleen Glasgow 5.50
67 'Bluey: Where's Bluey?: A Search-and-Find Book' Bluey 4.50
68 'Otherlands: A World in the Making - A Sunday Times bestseller' Dr Thomas Halliday 9.49
69 'The Silent Patient: The record-breaking, multimillion copy Sunday Times bestselling thriller and TikTok sensation' Alex Michaelides 5.99
70 'The Chimp Paradox: The Mind Management Programme to Help You Achieve Success, Confidence and Happiness: The Acclaimed Mind Management Programme to Help You Achieve Success, Confidence and Happiness' Professor Steve Peters 12.49
71 'Dolly’s Dream: The compelling and heartwarming new novel for 2023 from the No.1 Sunday Times bestseller: Book 6 (The Rockwood Chronicles)' Dilly Court 4.49
72 'Manifest: The Sunday Times Bestseller' Roxie Nafousi 13.00
73 'One: Simple One-Pan Wonders' Jamie Oliver 13.00
74 'The Murders at Fleat House: Lucinda Riley' Lucinda Riley 4.50
75 'Icebreaker' Hannah Grace 4.50
76 'No More Nappies: A Potty-Training Book (Campbell Big Steps, 2)' Campbell Books 5.99
77 'Still Life: The instant Sunday Times bestseller and BBC Between the Covers Book Club pick' Sarah Winman 7.49
78 'The 8-Week Blood Sugar Diet: Lose weight fast and reprogramme your body (The Fast 800 series)' Michael Mosley 6.49
79 'Zog' Julia Donaldson 4.00
80 'Good Girl, Bad Blood - The Sunday Times Bestseller: TikTok made me buy it! The Sunday Times Bestseller and sequel to A Good Girl's Guide to Murder: Book 2' Holly Jackson 4.00
81 'Closer to Love: How to Attract the Right Relationships and Deepen Your Connections' Vex King 11.99
82 'First Dinosaur Encyclopedia: A First Reference Book for Children (DK First Reference)' DK 4.99
83 'The Dead of Winter: The chilling new thriller from the No. 1 Sunday Times bestselling author of the Logan McRae series' Stuart MacBride 10.00
84 'RecipeTin Eats: Dinner: 150 recipes from Australia's favourite cook' Nagi Maehashi 20.00
85 'The Garnett Girls: The sweeping new debut novel and family drama of 2023 that everyone is falling in love with, for fans of Taylor Jenkins Reid' Georgina Moore 7.49
86 'Air-Fryer Cookbook (THE SUNDAY TIMES BESTSELLER): Quick, healthy and delicious recipes for beginners' Jenny Tschiesche 10.98
87 'The 2023 UK Air Fryer Cookbook for Beginners: Delicious, Healthy & Budget-Friendly Recipes Including Dinners, Sides, Snacks, Lunches & More (European Measurements & UK Ingredients)' Victoria Anderson 4.99
88 'The 6:20 Man' David Baldacci 4.49
89 'Guinness World Records 2023' Guinness World Records 5.00
90 'A Path through the Jungle: Psychological Health and Wellbeing Programme to Develop Robustness and Resilience (A Psychological Health and Wellbeing Programme to Develop Robustness and Resilience)' Professor Steve Peters 9.99
91 'The Hairy Bikers Eat to Beat Type 2 Diabetes: 80 delicious & filling recipes to get your health back on track' Hairy Bikers 8.00
92 'Billy's Bravery: A brand new Big Bright Feelings picture book exclusive for World Book Day' Tom Percival 1.00
93 'Ninja Dual Zone Air Fryer Cookbook for Beginners: 1600 Days Super-easy, Yummy and Affordable Home Cook Recipes for Beginners and Advanced Users' Cerys Ingram 11.68
94 'Diddly Squat: The No 1 Sunday Times Bestseller' Jeremy Clarkson 6.99
95 'The Smeds and the Smoos' Julia Donaldson 4.00
96 'Failure: What Jesus Said About Sin, Mistakes and Messing Stuff Up: The Archbishop of Canterbury's Lent Book 2023' Rt Revd Dr Emma Ineson 10.11
97 'DIRTY LAUNDRY: Why adults with ADHD are so ashamed and what we can do to help' Mr Richard Pink 13.99
98 '12 Rules for Life: An Antidote to Chaos' Jordan B. Peterson 10.11
99 'Easy Peasy Puppy Squeezy: The UK's No.1 Dog Training Book' Steve Mann 4.00
In [ ]:
 
In [ ]: