HEX
Server: Apache/2.4.52 (Ubuntu)
System: Linux spn-python 5.15.0-89-generic #99-Ubuntu SMP Mon Oct 30 20:42:41 UTC 2023 x86_64
User: arjun (1000)
PHP: 8.1.2-1ubuntu2.20
Disabled: NONE
Upload Files
File: /var/www/html/calendar-planning/app/utils/scrap_website_content.py
import requests
from bs4 import BeautifulSoup
import re

def extract_text_from_url(url: str, timeout: float = 10.0) -> dict:
    """Fetch *url* and return its visible text with whitespace collapsed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the caller.

    Returns:
        A dict with ``"statusCode"`` and ``"url"``. When the response status
        is 200 it also has ``"content"``: the document text with every run
        of whitespace replaced by a single space. For any other status the
        ``"content"`` key is absent.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection failure, invalid URL, timeout, ...).
    """
    # Step 1: Fetch the webpage content
    response = requests.get(url, timeout=timeout)
    result = {
        "statusCode": response.status_code,
        "url": url,
    }
    if response.status_code != 200:
        return result

    # Step 2: Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 3: Extract text from the entire document
    text = soup.get_text(separator='\n')

    # `\s+` already matches newlines, so a single substitution flattens the
    # text onto one line and squeezes repeated whitespace.
    result["content"] = re.sub(r'\s+', ' ', text)
    return result