In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import csv
import pandas as pd
With this script we extract the 100 Best Sellers from amazon.co.uk (title, author, price)
Attention! As of 19/2/2023 this script works like a charm. There is no guarantee that it will work tomorrow. Websites constantly change, and even a small change in the structure of the website can break the script. The good news is that many changes to websites are small and incremental, so we’ll likely be able to update our scraper with only minimal adjustments.
In [3]:
##### Web scraper that imitates page scrolling for lazy-loading pages #####

# --- Configuration -----------------------------------------------------------
# Local path to the ChromeDriver executable; adjust it for your own machine.
CHROMEDRIVER_PATH = r"C:\Users\cbots\Downloads\MY PORTFOLIO\Amazon Web Scraper Projects\Scraping 100 Best Sellers\chromedriver.exe"
BEST_SELLERS_URL = "https://www.amazon.co.uk/gp/bestsellers/books/ref=pd_zg_ts_books"
PAGE_LOAD_WAIT = 2     # seconds to wait after opening the first page
NEXT_PAGE_WAIT = 3     # seconds to wait after clicking the "Next page" button
SCROLL_PAUSE_TIME = 1  # seconds between scroll steps so the page loads properly
# One intermediate CSV per scraped page, in page order.
PAGE_CSV_FILES = ['titles_authors_prices.csv', 'titles_authors_prices2.csv']
COMBINED_CSV_FILE = 'best100_titles_authors_prices.csv'
COLUMN_NAMES = ['Book Title', 'Author', 'Price']

# CSS classes used to locate each field in the page source.
AUTHOR_CLASSES = ["a-size-small a-link-child", "a-size-small a-color-base"]
PRICE_CLASSES = ["_cDEzb_p13n-sc-price_3mJ9Z", "p13n-sc-price"]
TITLE_CLASSES = ['_cDEzb_p13n-sc-css-line-clamp-1_1Fn1y', '_cDEzb_p13n-sc-css-line-clamp-2_EWgCb']


def scroll_to_bottom(driver, pause=SCROLL_PAUSE_TIME):
    """Scroll the current page down one screen height at a time.

    Imitates mouse scrolling so that lazily loaded items are rendered before
    the page source is read.

    Args:
        driver: the Selenium WebDriver showing the page.
        pause: seconds to sleep after every scroll step.
    """
    screen_height = driver.execute_script("return window.screen.height;")
    step = 1
    while True:
        # Pass the offset as a script argument instead of formatting it into the JS source.
        driver.execute_script("window.scrollTo(0, arguments[0]);", screen_height * step)
        step += 1
        time.sleep(pause)
        # Re-read the scroll height after every step: it can change once new items load.
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        # Stop when the next scroll target lies beyond the total scroll height.
        if screen_height * step > scroll_height:
            break


def extract_books(html):
    """Parse one best-sellers page into rows of [title, author, price].

    Args:
        html: page source of an already fully scrolled page.

    Returns:
        A list of ``[title, author, price]`` lists. The title is wrapped in
        single quotes and the first pound sign is stripped from the price,
        matching the format of the CSV files this notebook writes.

    Raises:
        ValueError: if no titles are found, or if the numbers of titles,
            authors and prices differ (the three lists are paired purely by
            position, so a mismatch would mis-assign authors and prices).
    """
    soup = BeautifulSoup(html, "html.parser")
    authors = [node.get_text(strip=True) for node in soup.find_all(class_=AUTHOR_CLASSES)]
    prices = [node.get_text(strip=True) for node in soup.find_all(class_=PRICE_CLASSES)]

    titles = []
    for anchor in soup.find_all('a', {'class': 'a-link-normal'}):
        # Always search inside the anchor itself; distinct loop names keep the
        # second class lookup from running against a title div by accident.
        for title_class in TITLE_CLASSES:
            for title_div in anchor.find_all('div', {'class': title_class}):
                titles.append(title_div.get_text(strip=True))

    if not titles:
        raise ValueError('Error: No book titles were found on the page')
    if not (len(titles) == len(authors) == len(prices)):
        raise ValueError(
            'Error: The numbers of book titles, authors and prices do not match '
            f'({len(titles)} titles, {len(authors)} authors, {len(prices)} prices)'
        )

    return [
        [f"'{title}'", author, price.replace('£', '', 1).strip()]
        for title, author, price in zip(titles, authors, prices)
    ]


def write_rows(rows, path):
    """Write the given rows to ``path`` as a header-less UTF-8 CSV file."""
    with open(path, 'w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(rows)


# --- Scrape ------------------------------------------------------------------
service = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=service)
written_files = []
try:
    driver.get(BEST_SELLERS_URL)
    time.sleep(PAGE_LOAD_WAIT)  # allow the web page to open

    for page_number, page_csv in enumerate(PAGE_CSV_FILES, start=1):
        if page_number > 1:
            # Pagination: move on only if the current page has a Next Page button.
            next_button = driver.find_elements(By.CLASS_NAME, "a-last")
            if not next_button:
                print(f'No Next Page button found; stopping after page {page_number - 1}')
                break
            next_button[0].click()
            time.sleep(NEXT_PAGE_WAIT)  # wait for the next page to load

        scroll_to_bottom(driver)
        write_rows(extract_books(driver.page_source), page_csv)
        written_files.append(page_csv)
finally:
    # Always close the browser, even when scraping fails part-way through.
    driver.quit()

# --- Combine -----------------------------------------------------------------
# Read back only the per-page files written during this run, so stale files
# from an earlier run are never mixed into the result.
df = pd.concat(
    [pd.read_csv(path, header=None, names=COLUMN_NAMES) for path in written_files],
    ignore_index=True,
)
# write the combined data to a CSV file
df.to_csv(COMBINED_CSV_FILE, index=False)
In [4]:
pd.options.display.max_rows = None  # no row limit: show every row of a displayed DataFrame
In [5]:
pd.set_option('display.max_colwidth', 200)  # widen columns so the whole Book Title fits
In [6]:
# Display the combined DataFrame (`df`) built in the scraping cell
df
Out[6]:
| | Book Title | Author | Price |
|---|---|---|---|
| 0 | 'Bored of Lunch: The Healthy Air Fryer Book' | Nathan Anthony | 9.49 |
| 1 | 'Bored of Lunch: The Healthy Slow Cooker Book: THE NUMBER ONE BESTSELLER' | Nathan Anthony | 9.00 |
| 2 | 'Strong Female Character' | Fern Brady | 15.63 |
| 3 | 'The World According to Kaleb: THE SUNDAY TIMES BESTSELLER - worldly wisdom from the breakout star of Clarkson’s Farm' | Kaleb Cooper | 9.00 |
| 4 | 'It Ends With Us: The emotional #1 Sunday Times bestseller (Lily & Atlas, 1)' | Colleen Hoover | 4.50 |
| 5 | 'Atomic Habits: the life-changing million-copy #1 bestseller' | James Clear | 9.00 |
| 6 | 'The Ultimate Air Fryer Cookbook: Quick, healthy, energy-saving recipes using UK measurements' | Clare Andrews | 10.00 |
| 7 | 'Thrown: THE SUNDAY TIMES BESTSELLING novel of friendship, heartbreak and pottery for beginners' | Sara Cox | 6.74 |
| 8 | 'Dr Rupy Cooks: Over 100 easy, healthy, flavourful recipes' | Dr Rupy Aujla | 11.00 |
| 9 | 'The Keeper of Stories: The most charming and uplifting novel you will read this year!' | Sally Page | 3.00 |
| 10 | 'Freezing Order: Vladimir Putin, Russian Money Laundering and Murder - A True Story' | Bill Browder | 7.99 |
| 11 | 'Why Has Nobody Told Me This Before?: The No 1 Sunday Times Bestseller 2022' | Dr Julie Smith | 8.00 |
| 12 | 'Spare: by Prince Harry, The Duke of Sussex' | Prince Harry The Duke of Sussex | 14.00 |
| 13 | 'Menopausing: The Sunday Times bestselling self-help guide for 2022 to help you care for yourself, cope with symptoms, and live your best life during menopause' | Davina McCall | 11.00 |
| 14 | 'The Boy, The Mole, The Fox and The Horse' | Charlie Mackesy | 8.49 |
| 15 | 'Roald Dahl Collection 16 Books Box Set: Pakiet' | Roald Dahl | 34.45 |
| 16 | 'It Starts with Us: the highly anticipated sequel to IT ENDS WITH US' | Colleen Hoover | 7.00 |
| 17 | 'Pinch of Nom: Enjoy' | Kay Allinson | 10.00 |
| 18 | 'Verity: The thriller that will capture your heart and blow your mind' | Colleen Hoover | 4.50 |
| 19 | 'Stone Maidens' | Lloyd Devereux Richards | 8.99 |
| 20 | 'The Fast 800 Keto Recipe Book: Delicious low-carb recipes, for rapid weight loss and long-term health: The Sunday Times Bestseller (The Fast 800 Series)' | Dr Clare Bailey | 9.49 |
| 21 | 'The official highway code' | Driver & Vehicle Standards Agency | 4.99 |
| 22 | 'Follow the Money: How much does Britain cost?' | Paul Johnson | 19.29 |
| 23 | 'Lessons in Chemistry: The No. 1 Sunday Times bestseller and BBC Between the Covers Book Club pick' | Bonnie Garmus | 7.19 |
| 24 | 'What’s For Dinner?: The Sunday Times Bestseller – Fuss-free family food in 30 minutes, the first cookbook from the Taming Twins food blog' | Sarah Rossi | 10.00 |
| 25 | 'Fitwaffle's Baked In One: 100 one-tin cakes, bakes and desserts from the social media sensation' | Eloise Head | 11.00 |
| 26 | '8 Rules of Love: From Sunday Times No.1 bestselling author Jay Shetty, a new guide on how to find lasting love and enjoy healthy relationships' | Jay Shetty | 10.00 |
| 27 | 'Where's Spidey?: A Marvel Spider-Man search & find book' | Marvel Entertainment International Ltd | 3.00 |
| 28 | 'Seven Husbands of Evelyn Hugo: The Sunday Times Bestseller' | Taylor Jenkins Reid | 4.50 |
| 29 | 'Food for Life: The New Science of Eating Well, by the #1 bestselling author of SPOON-FED' | Tim Spector | 10.00 |
| 30 | 'A Million Sloths: Super-Cute Creatures to Colour: 1 (A Million Creatures to Colour)' | Lulu Mayo | 4.95 |
| 31 | 'The Thursday Murder Club: (The Thursday Murder Club 1)' | Richard Osman | 1.96 |
| 32 | 'What I Wish People Knew About Dementia: From Someone Who Knows' | Wendy Mitchell | 7.99 |
| 33 | 'Daisy Jones and The Six: From the Sunday Times bestselling author of CARRIE SOTO IS BACK' | Taylor Jenkins Reid | 5.00 |
| 34 | 'Matilda: Special Edition' | Roald Dahl | 4.00 |
| 35 | 'Greg the Sausage Roll: Wish You Were Here: A LadBaby Book' | Mark Hoyle | 11.99 |
| 36 | 'Someone Else’s Shoes: The No 1 Sunday Times bestseller from the author of Me Before You and The Giver of Stars' | Jojo Moyes | 11.00 |
| 37 | 'Colonialism: A Moral Reckoning' | Nigel Biggar | 19.29 |
| 38 | 'You Choose Your Adventure: A World Book Day 2023 Mini Book' | Pippa Goodhart | 0.88 |
| 39 | 'Slimming Eats Made Simple: Delicious and easy recipes – 100+ under 500 calories: 2' | Siobhan Wightman | 10.00 |
| 40 | 'Lessons in Chemistry: The No. 1 Sunday Times bestseller and BBC Between the Covers Book Club pick' | Bonnie Garmus | 8.49 |
| 41 | 'Young Forever: The Secrets to Living Your Longest, Healthiest Life' | Mark Hyman | 14.19 |
| 42 | 'The Man Who Died Twice: (The Thursday Murder Club 2)' | Richard Osman | 4.50 |
| 43 | 'The Salt Path: The 85-Week Sunday Times Bestseller from the Million-Copy Bestselling Author (Raynor Winn, 1)' | Raynor Winn | 9.99 |
| 44 | 'Rich Dad Poor Dad: What the Rich Teach Their Kids About Money That the Poor and Middle Class Do Not!' | Robert T. Kiyosaki | 7.99 |
| 45 | 'How To Kill Your Family: THE #1 SUNDAY TIMES BESTSELLER' | Bella Mackie | 4.50 |
| 46 | 'The Bullet That Missed: (The Thursday Murder Club 3)' | Richard Osman | 10.00 |
| 47 | 'Time to Think: The Inside Story of the Collapse of the Tavistock’s Gender Service for Children' | Hannah Barnes | 16.30 |
| 48 | 'Me vs Brain: An Overthinker’s Guide to Life' | Hayley Morris | 14.99 |
| 49 | 'Fleishman Is in Trouble: Soon to be a major TV series starring Claire Danes & Jesse Eisenberg' | Taffy Brodesser-Akner | 6.99 |
| 50 | 'My Husband's Killer: The emotional, twisty new mystery from the #1 bestselling author of Friend Request' | Laura Marshall | 7.32 |
| 51 | 'Kay's Brilliant Brains: A World Book Day 2023 Mini Book' | Adam Kay | 1.00 |
| 52 | 'Never Never: TikTok made me buy it! The romantic thriller from BookTok sensation and Sunday Times bestselling author of It Ends with Us and New York Times bestselling author of The Wives' | Colleen Hoover | 4.49 |
| 53 | 'Minecraft Annual 2023: The best new official children’s gaming annual of 2022 – perfect for kids into video games!' | Mojang AB | 1.69 |
| 54 | 'The Body Keeps the Score: Mind, Brain and Body in the Transformation of Trauma' | Bessel van der Kolk | 11.95 |
| 55 | 'Ugly Love: a novel' | Colleen Hoover | 4.50 |
| 56 | 'Diddly Squat: ‘Til The Cows Come Home: The No 1 Sunday Times Bestseller 2022' | Jeremy Clarkson | 10.00 |
| 57 | 'Roald Dahl 15 Book Box Set Collection' | Roald Dahl | 35.75 |
| 58 | 'The Paper Palace: The No.1 New York Times Bestseller and Reese Witherspoon Bookclub Pick' | Miranda Cowley Heller | 5.00 |
| 59 | 'A Good Girl's Guide to Murder: TikTok made me buy it! The first book in the bestselling thriller trilogy, as seen in Netflix’s Heartstopper! (A Good Girl’s Guide to Murder, Book 1)' | Holly Jackson | 4.00 |
| 60 | 'Mr Men Little Miss Pancake Day: The perfect illustrated children’s book to celebrate Pancake Day! (Mr. Men and Little Miss Picture Books)' | Adam Hargreaves | 4.50 |
| 61 | 'A Million Ways to Stay on the Run: The uncut story of the international manhunt for public enemy no.1 Kenny Noye' | Donal MacIntyre | 6.99 |
| 62 | 'My Husband's Killer: The emotional, twisty new mystery from the #1 bestselling author of Friend Request' | Laura Marshall | 14.99 |
| 63 | 'The Very Hungry Caterpillar [Board Book]: Eric Carle' | Eric Carle | 5.03 |
| 64 | 'The Midnight Library: The No.1 Sunday Times bestseller and worldwide phenomenon' | Matt Haig | 4.50 |
| 65 | 'Diabetic Air Fryer Cookbook for Beginners: 1900 Days Quick, Crispy and Healthy Diabetic Friendly Recipes to Take Care of Your Well-Being without Sacrificing Taste | 28-Day Meal Plan for Balanced ... | Amanda Ray | 3.25 |
| 66 | 'Girl in Pieces: TikTok made me buy it!' | Kathleen Glasgow | 5.50 |
| 67 | 'Bluey: Where's Bluey?: A Search-and-Find Book' | Bluey | 4.50 |
| 68 | 'Otherlands: A World in the Making - A Sunday Times bestseller' | Dr Thomas Halliday | 9.49 |
| 69 | 'The Silent Patient: The record-breaking, multimillion copy Sunday Times bestselling thriller and TikTok sensation' | Alex Michaelides | 5.99 |
| 70 | 'The Chimp Paradox: The Mind Management Programme to Help You Achieve Success, Confidence and Happiness: The Acclaimed Mind Management Programme to Help You Achieve Success, Confidence and Happiness' | Professor Steve Peters | 12.49 |
| 71 | 'Dolly’s Dream: The compelling and heartwarming new novel for 2023 from the No.1 Sunday Times bestseller: Book 6 (The Rockwood Chronicles)' | Dilly Court | 4.49 |
| 72 | 'Manifest: The Sunday Times Bestseller' | Roxie Nafousi | 13.00 |
| 73 | 'One: Simple One-Pan Wonders' | Jamie Oliver | 13.00 |
| 74 | 'The Murders at Fleat House: Lucinda Riley' | Lucinda Riley | 4.50 |
| 75 | 'Icebreaker' | Hannah Grace | 4.50 |
| 76 | 'No More Nappies: A Potty-Training Book (Campbell Big Steps, 2)' | Campbell Books | 5.99 |
| 77 | 'Still Life: The instant Sunday Times bestseller and BBC Between the Covers Book Club pick' | Sarah Winman | 7.49 |
| 78 | 'The 8-Week Blood Sugar Diet: Lose weight fast and reprogramme your body (The Fast 800 series)' | Michael Mosley | 6.49 |
| 79 | 'Zog' | Julia Donaldson | 4.00 |
| 80 | 'Good Girl, Bad Blood - The Sunday Times Bestseller: TikTok made me buy it! The Sunday Times Bestseller and sequel to A Good Girl's Guide to Murder: Book 2' | Holly Jackson | 4.00 |
| 81 | 'Closer to Love: How to Attract the Right Relationships and Deepen Your Connections' | Vex King | 11.99 |
| 82 | 'First Dinosaur Encyclopedia: A First Reference Book for Children (DK First Reference)' | DK | 4.99 |
| 83 | 'The Dead of Winter: The chilling new thriller from the No. 1 Sunday Times bestselling author of the Logan McRae series' | Stuart MacBride | 10.00 |
| 84 | 'RecipeTin Eats: Dinner: 150 recipes from Australia's favourite cook' | Nagi Maehashi | 20.00 |
| 85 | 'The Garnett Girls: The sweeping new debut novel and family drama of 2023 that everyone is falling in love with, for fans of Taylor Jenkins Reid' | Georgina Moore | 7.49 |
| 86 | 'Air-Fryer Cookbook (THE SUNDAY TIMES BESTSELLER): Quick, healthy and delicious recipes for beginners' | Jenny Tschiesche | 10.98 |
| 87 | 'The 2023 UK Air Fryer Cookbook for Beginners: Delicious, Healthy & Budget-Friendly Recipes Including Dinners, Sides, Snacks, Lunches & More (European Measurements & UK Ingredients)' | Victoria Anderson | 4.99 |
| 88 | 'The 6:20 Man' | David Baldacci | 4.49 |
| 89 | 'Guinness World Records 2023' | Guinness World Records | 5.00 |
| 90 | 'A Path through the Jungle: Psychological Health and Wellbeing Programme to Develop Robustness and Resilience (A Psychological Health and Wellbeing Programme to Develop Robustness and Resilience)' | Professor Steve Peters | 9.99 |
| 91 | 'The Hairy Bikers Eat to Beat Type 2 Diabetes: 80 delicious & filling recipes to get your health back on track' | Hairy Bikers | 8.00 |
| 92 | 'Billy's Bravery: A brand new Big Bright Feelings picture book exclusive for World Book Day' | Tom Percival | 1.00 |
| 93 | 'Ninja Dual Zone Air Fryer Cookbook for Beginners: 1600 Days Super-easy, Yummy and Affordable Home Cook Recipes for Beginners and Advanced Users' | Cerys Ingram | 11.68 |
| 94 | 'Diddly Squat: The No 1 Sunday Times Bestseller' | Jeremy Clarkson | 6.99 |
| 95 | 'The Smeds and the Smoos' | Julia Donaldson | 4.00 |
| 96 | 'Failure: What Jesus Said About Sin, Mistakes and Messing Stuff Up: The Archbishop of Canterbury's Lent Book 2023' | Rt Revd Dr Emma Ineson | 10.11 |
| 97 | 'DIRTY LAUNDRY: Why adults with ADHD are so ashamed and what we can do to help' | Mr Richard Pink | 13.99 |
| 98 | '12 Rules for Life: An Antidote to Chaos' | Jordan B. Peterson | 10.11 |
| 99 | 'Easy Peasy Puppy Squeezy: The UK's No.1 Dog Training Book' | Steve Mann | 4.00 |
In [ ]:
In [ ]: