Commit 0385ec8b authored by Esteban Prince, Liam (UG - Computer Science)

Replace scraper.py

parent 3857d470
# Install go
# sudo dnf install go
# Install goquery
# go get goquery
# https://github.com/Thomas-Rudge/BBC-Recipe-Web-Scraper/blob/master/scrape_bbc_recipes.py
import json
from recipe_scrapers import scrape_me
from usp.tree import sitemap_tree_for_homepage
import urllib.request
import requests
from bs4 import BeautifulSoup
def scrapeIngredients():
    # Run as: python3 scrapers/ingredients.py > database/ingredients.json
    urls = ["https://www.bbc.co.uk/food/ingredients/a-z/a/1",
            "https://www.bbc.co.uk/food/ingredients/a-z/a/2",
            # ... (remaining a-z ingredient pages truncated in this view) ...
            "https://www.bbc.co.uk/food/ingredients/a-z/z/1"
            ]
    ingredientsList = []
    for url in urls:
        response = urllib.request.urlopen(url)
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        # Ingredient names on the A-Z pages are rendered as <h3> headings
        ingredients = soup.select("h3")
        for ingredient in ingredients:
            ingredientsList.append(ingredient.text.strip())
    # TODO: add source bbc.co.uk/food/ingredients to schema
    with open('database/ingredients.json', 'w') as i:
        json.dump(ingredientsList, i)
def scrapeBBCGoodFood():
    sitemap = "https://www.bbcgoodfood.com/sitemap.xml"
    prefix = "http://bbcgoodfood.com/recipes"
    urls = parseSitemap(sitemap, prefix)
    # This part is run from the shell, once per recipe URL:
    #   go run main.go <url> >> recipes/bbcgoodfood.json
    return urls
def scrapeBBCFood():
    sitemap = "https://www.bbc.co.uk/food/sitemap.xml"
    prefix = "http://bbc.co.uk/food/recipes"
    urls = parseSitemap(sitemap, prefix)
    # This part is run from the shell, once per recipe URL:
    #   go run main.go <url> >> recipes/bbcfood.json
    return urls
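# A minimal sketch (not part of the original script) of driving the Go scraper from
# Python instead of the shell, assuming main.go takes one recipe URL as its first
# argument and prints JSON to stdout; the function name and output path are illustrative.
import subprocess

def runGoScraper(url, outfile="recipes/bbcgoodfood.json"):
    # Append whatever main.go prints for this URL to the output file
    result = subprocess.run(["go", "run", "main.go", url],
                            capture_output=True, text=True, check=True)
    with open(outfile, "a") as f:
        f.write(result.stdout)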
def scrapeEpicurious():
    sitemap = "https://www.epicurious.com/services/sitemap/recipes/editorial/2020"
    urls = parseSitemap(sitemap)
    return urls
def scrapeBonAppetit():
    pass  # TODO: not implemented yet

# Create a list of all recipes to scrape by parsing the sitemap
def parseSitemap(sitemap, prefix=""):
    response = urllib.request.urlopen(sitemap)
    html = response.read()
    soup = BeautifulSoup(html, 'xml')
    urls = []
    # Every <loc> entry holds one URL; keep those matching the recipe prefix
    for line in soup.find_all('loc'):
        for string in line.stripped_strings:
            if string.startswith(prefix):
                urls.append(string)
    return urls
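# A minimal sketch, assuming the otherwise unused sitemap_tree_for_homepage import above
# is meant for sites whose sitemap location is unknown: ultimate-sitemap-parser discovers
# sitemaps from the homepage. The function name and prefix filter are illustrative.
def parseSitemapWithUsp(homepage, prefix=""):
    tree = sitemap_tree_for_homepage(homepage)
    return [page.url for page in tree.all_pages() if page.url.startswith(prefix)]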
def urlList(sitemap):
    # Find the sitemap location first, see https://stackoverflow.com/questions/10232774/how-to-find-sitemap-xml-path-on-websites
    soup = BeautifulSoup(requests.get(sitemap).text, 'xml')
    urls = [loc.text for loc in soup.find_all("loc") if "recipe" in loc.text]
    return urls
websites = [
"http://101cookbooks.com/",
"http://allrecipes.com/",
"http://bbc.com/",
"http://bbc.co.uk/",
"http://bbcgoodfood.com/",
"http://bettycrocker.com/",
"http://bonappetit.com/",
"https://www.budgetbytes.com/",
"http://closetcooking.com/",
"http://cookstr.com/",
"http://copykat.com/",
"https://en.wikibooks.org/",
"http://epicurious.com/",
"http://finedininglovers.com/",
"https://food.com/",
"http://foodnetwork.com/",
"http://foodrepublic.com/",
"https://geniuskitchen.com/",
"https://greatbritishchefs.com/",
"http://giallozafferano.it/",
"http://gonnawantseconds.com/",
"https://healthyeating.nhlbi.nih.gov/",
"https://heinzbrasil.com.br/",
"https://www.hellofresh.com/",
"https://www.hellofresh.co.uk/",
"https://receitas.ig.com.br/",
"https://inspiralized.com/",
"http://jamieoliver.com/",
"https://www.thekitchn.com/",
"https://www.matprat.no/",
"http://mybakingaddiction.com/",
"https://panelinha.com.br/",
"http://paninihappy.com/",
"http://realsimple.com/",
"https://www.seriouseats.com/",
"http://simplyrecipes.com/",
"https://www.southernliving.com/",
"http://steamykitchen.com/",
"https://www.tastesoflizzyt.com",
"http://tastykitchen.com/",
"http://thepioneerwoman.com/",
"https://www.thespruceeats.com/",
"http://thehappyfoodie.co.uk/",
"http://thevintagemixer.com/",
"http://tine.no/",
"http://twopeasandtheirpod.com/",
"http://whatsgabycooking.com/",
"http://yummly.com/"
]
urlList("https://bbc.co.uk/food/sitemap.xml")
for website in websites:
for url in urlList(website):
#with open file as # json pickle
print(scrape_me(url).title())
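# A minimal sketch of persisting results instead of printing them, picking up the
# "json pickle" TODO above; it uses the recipe_scrapers title/ingredients/instructions
# accessors, and the output path database/recipes.json is illustrative.
def dumpRecipes(urls, outfile="database/recipes.json"):
    recipes = []
    for url in urls:
        scraper = scrape_me(url)
        recipes.append({
            "url": url,
            "title": scraper.title(),
            "ingredients": scraper.ingredients(),
            "instructions": scraper.instructions(),
        })
    with open(outfile, "w") as f:
        json.dump(recipes, f)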