# Source code scraping

# Importing basic libraries
import os, requests, re, bs4, mysql.connector
# Import urllib for downloads
from urllib import request, parse
# Pillow used only to change png to jpg
from PIL import Image

# Toggle: when True the scraped rows are written to the MySQL database
use_db = True

# Wiki location: site root plus the path of the car index page
base_url = "https://nfsworld.fandom.com"
args_url = "/wiki/Cars"
# Brands whose name is made of two words
composed_brands = ["Alfa Romeo", "Aston Martin", "Ford Shelby"]

# Accumulators used for stats (three independent lists)
all_colors, all_cars, all_brands = [], [], []

# Make sure every download directory exists before anything is fetched
for download_dir in ("img", "img/cars", "img/constructors"):
    os.makedirs(download_dir, exist_ok=True)

# Get main page. A timeout keeps the script from hanging forever on a stalled
# connection, and raise_for_status() stops on an HTTP error instead of parsing
# an error page.
request_data = requests.get(base_url + args_url, timeout=30)
request_data.raise_for_status()
html_page = bs4.BeautifulSoup(request_data.content, "html.parser")
page_content = html_page.find("div", class_="mw-content-ltr")
if page_content is None:
    # Without this check the next line fails with an opaque AttributeError on None
    raise RuntimeError("Content container 'div.mw-content-ltr' not found on " + base_url + args_url)
# Get all cars on the main page (every <li> inside the content container)
car_list = page_content.find_all("li")
# car_list = [car_list[125]] # Testing only the selected car

if use_db:
    # Local MySQL server; the info2_cars database must already exist.
    # NOTE(review): credentials are hard-coded (root with an empty password).
    db = mysql.connector.connect(
        host="localhost",
        user="root",
        password="",
        database="info2_cars",
    )
    cursor = db.cursor()

# shift: index offset applied when some information is missing on a page
# car_id: running car counter (fallback in case auto-increment doesn't work)
shift = car_id = 0
# Fetch from the main page directly (abandoned approach).
# The loop below is wrapped in a bare triple-quoted string, so it is evaluated
# as an unused string expression and never executed.
# NOTE(review): both branches of the composed-brand check inside it take
# split()[0], so two-word brands were not actually handled there.
"""
for car in car_list:
    car_id += 1

    car_titles = car.find_all("a", title=True)
    all_cars.append(car_titles[-1]["title"])
    car_name = car_titles[-1]["title"]
    if any([x for x in composed_brands if x in car_name]):
        car_brand = car_name.split()[0]
    else:
        car_brand = car_name.split()[0]
    del car_titles[-1]

    for color in car_titles:
        all_colors.append(color["title"])
        car_color = color["title"]
        print(car_color)

    print(car_id, car_name, car_brand)
"""

# Fetch from every car link
for car in car_list:
    # The last link of the list item is followed as the car's own page
    href_car = car.find_all("a", href=True)[-1]["href"]

    # Car request (timeout so one stalled page cannot hang the whole run)
    request_car = requests.get(base_url + href_car, timeout=30)
    html_car_page = bs4.BeautifulSoup(request_car.content, "html.parser")

    # Car id counter for index (or if AI doesn't work)
    car_id += 1
    # "/" is replaced because the name is later used to build file paths
    car_name = html_car_page.find("h1", class_="page-header__title").text.replace("/", "-")
    # NOTE(review): car_base_name is derived before the trailing-dot strip
    # below, so it keeps a trailing "." if the title has one.
    car_base_name = car_name.replace(" ", "_")
    print("Processing " + car_name)
    all_cars.append(car_name)
    # Specific car with no information: undo the id increment and skip it
    if car_name == "Speed Rabbit SUV":
        car_id -= 1
        continue
    # Windows folder fail if dot at the end
    if car_name.endswith("."):
        car_name = car_name[:-1]
    # Register the picture as jpg to download.
    # str.replace returns a new string; the original discarded the result.
    car_name = car_name.replace(".png", ".jpg")

    # Get brand name: two words for composed brands, otherwise the first word
    if any(brand in car_name for brand in composed_brands):
        car_brand_name = " ".join(car_name.split()[:2])
    else:
        car_brand_name = car_name.split()[0]
    # Specific car in game (registered as NFSW brand)
    if car_brand_name == "Battlefield":
        car_brand_name = "NFSW"
    # Register the brand the first time it is seen.
    # NOTE(review): brand_exists is True when the brand was just added (i.e. it
    # did NOT exist before); the flag is kept as-is because its consumers are
    # not visible here.
    if car_brand_name not in all_brands:
        all_brands.append(car_brand_name)
        brand_exists = True
    else:
        brand_exists = False
    # Brand id is the 1-based position of the brand in all_brands. Using
    # len(all_brands) was only right for the most recently added brand and
    # returned another brand's id for any brand seen earlier.
    car_brand_id = all_brands.index(car_brand_name) + 1

    # Get car description: concatenate the text of the children of the content
    # div, skipping its first three children and stopping at the first <nav>.
    car_desc_elements = html_car_page.find("div", id="mw-content-text").contents[3:]
    car_desc_parts = []
    for element in car_desc_elements:
        if element.name == "nav":
            break
        # Tables and bare newline nodes carry no description text
        if element.name == "table" or element == "\n":
            continue
        # Nodes without a .text attribute are appended as-is (what the old
        # bare-except fallback did), without swallowing unrelated errors.
        car_desc_parts.append(getattr(element, "text", element))
    # str.strip() returns a new string; the original discarded the result, so
    # the leading space and line breaks were never removed.
    car_desc = "".join(car_desc_parts).strip()

    # Car picture: the last <img> on the page whose width attribute is "250"
    fixed_width_images = html_car_page.find_all("img", width="250")
    car_pic_raw = fixed_width_images[-1]
    # Keep only the part of the source URL that precedes "/revision/"
    car_pic_link = car_pic_raw["src"].partition("/revision/")[0]
    # File name of the picture, URL-decoded
    car_pic = parse.unquote(car_pic_raw["data-image-key"])
    # Get dates below images (small tag)
    if len(html_car_page.find_all("small")) <= 2:
        # Shift by 2 with a car to get right values
        shift = 2
        # Dates correspond to the Battlefield Heroes SUV only
        car_date_start = "2012"
        car_date_end = "2015"
    else:
        shift = 0
        car_date_start = re.findall(r"
(\d+)", str(html_car_page.find("small")))[0]
        car_date_end = re.findall(r"[-|>]\s?(\w+)([0-9]{3})", str(car_style_data).strip())
        # Performance triple, in the order top speed / acceleration / handling
        car_style_topspeed, car_style_acceleration, car_style_handling = (
            car_style_perf[0],
            car_style_perf[1],
            car_style_perf[2],
        )

        # Prices may be missing; amounts below the minimum cash cost (62500)
        # are stored in the "sb" column, the other amount in the regular one.
        if "," in str(car_price):
            amounts = [int(tag.text.replace(",", "")) for tag in car_price]
            first_amount = amounts[0]
            # With a single amount the other column stays empty
            second_amount = amounts[1] if len(amounts) != 1 else None
            if first_amount < 62500:
                car_style_price, car_style_price_sb = second_amount, first_amount
            else:
                car_style_price, car_style_price_sb = first_amount, second_amount
        else:
            # No usable price information
            car_style_price = car_style_price_sb = None

        # Download car pic edition into the car's own folder
        if car_style_pic:
            style_pic_path = f"img/cars/{car_base_name}/{car_style_pic}"
            request.urlretrieve(car_style_pic_link, style_pic_path)
            if ".png" in car_style_pic:
                # JPEG cannot store an alpha channel or a palette, so the image
                # is converted to RGB first; saving an RGBA/P image as JPEG
                # raises in Pillow. The context manager closes the source file
                # before it is deleted.
                with Image.open(style_pic_path) as img:
                    img.convert("RGB").save(f"img/cars/{car_base_name}/{car_style_pic[:-4]}.jpg")
                os.remove(style_pic_path)
                # NOTE(review): car_style_pic itself is not renamed, so the row
                # inserted below still carries the .png file name.

        # One row per car edition; the tuple order matches the column list of
        # the INSERT statement.
        val = (
            car_id,
            car_style_name,
            car_style_pic,
            car_style_class,
            car_style_overall,
            car_style_topspeed,
            car_style_acceleration,
            car_style_handling,
            car_style_price,
            car_style_price_sb,
        )
        # Progress trace: the same values, space separated
        print(*val)
        sql = "INSERT INTO editions (car_id, style_name, style_pic, style_class, style_overall, style_topspeed, style_acceleration, style_handling, style_price, style_price_sb) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        if use_db:
            # Parameterized insert, committed immediately after each row
            cursor.execute(sql, val)
            db.commit()

if use_db:
    # db.commit() # In case last one is skipped
    # Release the cursor explicitly before closing the connection it belongs to
    cursor.close()
    db.close()
    print("Done")