Web Scraping word lists with Python

A quick little project to scrape a long word list from a paginated website.

import random
import time

import requests
from bs4 import BeautifulSoup

# Page numbers to scrape. range() excludes its stop value, so this covers
# pages 165 through 174.
x = range(165, 175)

# Seconds to wait on the server before giving up. Without a timeout,
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

for n in x:
    # Wait a random interval before each request (presumably so the site is
    # not hit with rapid, evenly spaced requests).
    delay = random.uniform(0.3, 2.5)
    print(f"Starting page {n} in {delay} seconds.")

    time.sleep(delay)
    # URL of the web page to scrape
    url = f"https://www.[wordlistdomain].com/words?page={n}"  # Replace with the actual URL

    # Fetch the page. A network failure or HTTP error status skips this page
    # instead of aborting the run or parsing an error page as word data.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping page {n}: request failed ({err})")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the unordered list with ID "words-list". find() returns None when
    # the element is missing, so guard before using the result.
    word_list = soup.find("ul", id="words-list")
    if word_list is None:
        print(f"Skipping page {n}: no <ul id=\"words-list\"> element found.")
        continue

    # Collect the text of the <a> tag inside each list item, ignoring any
    # <li> that does not contain a link.
    links = (item.find("a") for item in word_list.find_all("li"))
    words = [link.get_text() for link in links if link is not None]

    # Append this page's words to the output file, one per line. The file is
    # reopened per page so earlier pages are already saved if a later one
    # fails; UTF-8 is explicit so non-ASCII words are written the same way on
    # every platform.
    with open("word_list.txt", "a", encoding="utf-8") as file:
        file.writelines(f"{word}\n" for word in words)

print("Words appended to word_list.txt")

Comments

Filtered HTML

  • Web page addresses and email addresses turn into links automatically.
  • Allowed HTML tags: <a href hreflang> <em> <strong> <cite> <blockquote cite> <code> <ul type> <ol start type> <li> <dl> <dt> <dd>
  • Lines and paragraphs break automatically.