# File: /var/www/html/calendar-planning/app/utils/scrap_website_content.py
import requests
from bs4 import BeautifulSoup
import re
def extract_text_from_url(url: str, *, timeout: float = 10.0) -> dict:
    """Fetch a web page and return its visible text as a single line.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` would wait indefinitely on an
            unresponsive host.

    Returns:
        A dict with ``"statusCode"`` and ``"url"``. When the response
        status is 200 it also carries ``"content"``: the document text
        with every run of whitespace collapsed to one space.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, timeouts, invalid URLs).
    """
    # Step 1: Fetch the webpage content (bounded by `timeout`).
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Anything other than 200 is reported back without a "content" key.
        return {
            "statusCode": response.status_code,
            "url": url,
        }

    # Step 2: Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 3: Extract text from the entire document
    text = soup.get_text(separator='\n')

    # `\s+` already matches newlines, so a single substitution flattens
    # the text; no separate newline replacement is needed.
    return {
        "statusCode": response.status_code,
        "url": url,
        "content": re.sub(r'\s+', ' ', text),
    }