Python - Web Scrapping MeetUp

Web Scrapping MeetUp

Publicado por mariona (17 intervenciones) el 19/12/2023 09:05:48

Hola, estoy intentando scrapear los eventos de MeetUp para Madrid y exportarlos a un CSV con las siguientes variables: image_url, event_name, event_date, event_time, event_group, description_text, location_name, location_info, map_link, event_categories

Tengo el siguiente código pero no me funciona, si me pudierais ayudar por favor que no soy programadora y no sé muy bien cómo debo hacerlo. Gracias de antemano.

No sé si el problema radica en la paginación o es que tengo algo mal definido en el código. Si me pudierais ayudar os lo agradecería, pues lo necesito para un proyecto.

from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os
import time # Import the time module
import random

# Meetup search page listing in-person events for Madrid
base_url = "https://www.meetup.com/es-ES/find/?location=es--Madrid&source=EVENTS&eventType=inPerson"

# Upper bound on how many events the scraper collects
event_limit = 100

# Folder for event images; created up front when it does not exist yet
image_dir = "event_images"
os.makedirs(image_dir, exist_ok=True)

# Accumulates one row (a list of field values) per scraped event
data = []

# Shared Selenium driver; Chrome and chromedriver must be installed
driver = webdriver.Chrome()

# Function to get event details including the image
def get_event_details(event_url):
    """Load an event page once and extract its details.

    Parameters:
        event_url: URL of the event page, opened with the shared Selenium
            ``driver``.

    Returns:
        A tuple ``(image_url, description_text, location_name,
        location_info, map_link, categories)``. Each element is a string; a
        placeholder message is returned for any element that is not found in
        the page.
    """
    driver.get(event_url)

    # Wait 1-2 seconds BEFORE reading the page source; sleeping after the
    # HTML has already been captured cannot affect what gets parsed.
    time.sleep(1 + random.uniform(0, 1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Event image: first <img> that carries both "alt" and "data-nimg"
    image_element = soup.find("img", {"alt": True, "data-nimg": True})
    if image_element:
        image_url = image_element.get('src', "Image URL not available")
    else:
        image_url = "Image URL not available"

    # Description: one line per <p>; the container may be missing
    description_element = soup.find("div", {"class": "break-words"})
    if description_element:
        paragraphs = description_element.find_all("p")
        description_text = "\n".join(p.text.strip() for p in paragraphs)
    else:
        description_text = "Description not available"

    venue_link = soup.select_one("a[data-testid='venue-name-link']")
    location_name = venue_link.text if venue_link else "Information not available"

    location_info_element = soup.find("div", {"class": "text-gray6", "data-testid": "location-info"})
    location_info = location_info_element.text.strip() if location_info_element else "Information not available"

    map_anchor = soup.find("a", {"data-testid": "map-link"})
    if map_anchor:
        map_link = map_anchor.get('href', "Map link not available")
    else:
        map_link = "Map link not available"

    # Topics are read from the page that is already loaded, instead of
    # navigating to the same URL a second time just to parse it again.
    topics_element = soup.find("div", {"id": "topics"})
    if topics_element:
        categories = ", ".join(link.text for link in topics_element.find_all("a"))
    else:
        categories = "Categories not available"

    return image_url, description_text, location_name, location_info, map_link, categories

# Function to get event categories
def get_event_categories(event_url):
    """Open ``event_url`` and return its topic names as one string.

    The names are the texts of the links inside the ``<div id="topics">``
    element, joined with ", ". When that element is absent the placeholder
    "Categories not available" is returned instead.
    """
    driver.get(event_url)
    page = BeautifulSoup(driver.page_source, 'html.parser')

    topics_box = page.find("div", {"id": "topics"})
    if not topics_box:
        return "Categories not available"

    topic_names = [link.text for link in topics_box.find_all("a")]
    return ", ".join(topic_names)

# Counter for the number of collected events
event_count = 0
# Set of event URLs already collected, used to skip duplicates
collected_event_urls = set()
# Current page number
page_number = 1


def _card_text(element, fallback):
    """Return the stripped text of a parsed element, or ``fallback`` if it is None."""
    return element.text.strip() if element is not None else fallback


try:
    while event_count < event_limit:
        # Build the URL of the current page and load it
        url = f"{base_url}&page={page_number}"
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find all elements containing event details
        event_elements = soup.find_all("div", {"data-element-name": "categoryResults-eventCard"})
        if not event_elements:
            break

        new_events_on_page = 0
        for event_element in event_elements:
            event_url_element = event_element.find("a", {"data-event-label": "Event card"})
            # Skip cards that carry no usable link instead of crashing
            if event_url_element is None or not event_url_element.get("href"):
                continue
            event_url = event_url_element["href"]

            # Skip events that were already collected
            if event_url in collected_event_urls:
                continue

            image_url, description_text, location_name, location_info, map_link, event_categories = get_event_details(event_url)

            event_name = _card_text(
                event_element.find('h2', class_="text-gray7 font-medium text-base pb-1 pt-0 line-clamp-3"),
                "Name not available")
            event_date = _card_text(event_element.find("span"), "Date not available")

            # The time is taken from the second <span> inside <time>, when present
            event_time_element = event_element.find("time")
            time_spans = event_time_element.find_all("span") if event_time_element else []
            event_time = time_spans[1].text.strip() if len(time_spans) > 1 else "Time not available"

            event_group = _card_text(event_element.find("p", class_="text-gray6"), "Group not available")

            # Remember the URL so the same event is not collected twice
            collected_event_urls.add(event_url)
            data.append([image_url, event_name, event_date, event_time, event_group, description_text, location_name, location_info, map_link, event_categories])

            event_count += 1
            new_events_on_page += 1

            if event_count >= event_limit:
                break

        # If a page contributed nothing new (e.g. the site ignores the "page"
        # parameter and keeps returning the same cards), stop; otherwise
        # event_count never advances and the loop would run forever.
        if new_events_on_page == 0:
            break

        page_number += 1
finally:
    # Always close the browser, even when scraping fails midway
    driver.quit()

# Write the header followed by every collected event row to the CSV file
csv_header = ["Image URL", "Name", "Date", "Time", "Group", "Description", "Location Name", "Location Info", "Map Link", "Categories"]
with open('meetup_events_madrid.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_header)
    writer.writerows(data)

Valora esta pregunta

Me gusta: Está pregunta es útil y esta clara

No me gusta: Está pregunta no esta clara o no es útil

Responder

Web Scrapping MeetUp

Publicado por Costero (93 intervenciones) el 19/12/2023 17:42:12

¿Qué significa "No funciona"?

¿El código no corre, corre pero nunca termina, los valores del CSV son incorrectos, etc.?

Otra cosa: cuando pongas el código aquí, usa la etiqueta de código para que no se pierda la indentación.

Sal2

Valora esta respuesta

Me gusta: Está respuesta es útil y esta clara

No me gusta: Está respuesta no esta clara o no es útil

Comentar

Web Scrapping MeetUp

Publicado por mariona (17 intervenciones) el 20/12/2023 09:18:28

Sí, disculpa. Pongo que no funciona porque cuando lo hice (antes que el de EventBrite) funcionaba y me devolvía el csv con los valores. Pero no sé si se ha modificado algo en la web; creo que es la paginación, ya que ahora hay como una especie de scroll infinito.

Entonces ahora no me devuelve nada en el csv, ya que no termina de ejecutarse el código correctamente.
No sé cómo debería ser el código que te mandé para que me funcione con el tipo de paginación del scroll infinito.

Valora esta respuesta

Comentar

Web Scrapping MeetUp

Publicado por Costero (93 intervenciones) el 20/12/2023 19:21:49

Sí, correcto: no hay paginación sino scroll infinito.

Usualmente (que yo sepa siempre, no hay de otra) detrás de un scroll infinito hay una API que se llama para retornar los valores. Así que, en vez de scrapear, llamamos directamente a la API, y voilà.

Espero que sirva.

import csv

import os

import requests

import json

from datetime import datetime

# Number of events requested from the API in a single call
num_of_events = 100

# First day of the search window (YYYY-MM-DD), interpolated into the request
start_date = "2023-12-21"

# Folder for event images; created up front when it does not exist yet
image_dir = "event_images"
os.makedirs(image_dir, exist_ok=True)

# Accumulates one row (a list of field values) per event
data = []

def get_date_and_time(datetime_str):
    """Split an ISO-8601 timestamp into its local date and time of day.

    Parameters:
        datetime_str: timestamp such as ``"2023-12-21T18:30:00+01:00"``.
            Seconds and the UTC offset are both optional, so
            ``"2023-12-21T18:30+01:00"`` and ``"2023-12-21T18:30:00"`` are
            accepted as well.

    Returns:
        ``(date, time)`` as ``datetime.date`` and naive ``datetime.time``
        objects. The time is the wall-clock time written in the string (the
        offset is not applied) and is truncated to whole minutes.

    Raises:
        ValueError: if ``datetime_str`` is not a valid ISO-8601 timestamp.
    """
    # fromisoformat parses the optional seconds/offset itself; slicing the
    # string on ':' only works when both of them are present.
    parsed = datetime.fromisoformat(datetime_str)

    # time() drops the tzinfo; seconds are zeroed to keep minute precision
    return parsed.date(), parsed.time().replace(second=0, microsecond=0)

def get_event_fields(value):
    """Flatten one event edge of the API response into CSV-ready fields.

    Parameters:
        value: mapping with a ``'node'`` entry holding the event data
            (``description``, ``dateTime``, ``featuredEventPhoto``,
            ``venue``, ``group``, ``title`` and ``eventUrl``).

    Returns:
        A tuple ``(description, date, time, image_url, location, lat, lon,
        map_link, event_group, event_name, event_url)``. When the event has
        no photo or no venue, the affected text fields are set to
        ``"No disponible"`` and ``lat``/``lon`` are ``None``.

    Raises:
        KeyError: if a required entry (``node``, ``dateTime``, ``group``,
            ``title``, ``eventUrl``) is missing.
    """
    node = value['node']

    # Guard against a null description; commas and newlines are replaced so
    # each description stays on a single line
    description = (node.get('description') or "").replace(",", ";").replace("\n", "|")

    event_date, event_time = get_date_and_time(node['dateTime'])

    photo = node.get('featuredEventPhoto')
    image_url = photo['highResUrl'] if photo else "No disponible"

    # The venue may be null; do not crash on events without one
    venue = node.get('venue')
    if venue:
        location = (venue.get('name') or "") + "; " + (venue.get('city') or "")
        lat = venue.get('lat')
        lon = venue.get('lon')
        map_link = f"https://www.google.com/maps/search/?api=1&query={lat}%2C%20{lon}"
    else:
        location = "No disponible"
        lat = lon = None
        map_link = "No disponible"

    event_group = node['group']['name']
    event_name = node['title'].replace("\n", "|")
    event_url = node['eventUrl']

    return (description, event_date, event_time, image_url, location, lat, lon,
            map_link, event_group, event_name, event_url)

# Request payload for the persisted "recommendedEventsWithSeries" query.
# Built as a dict directly, rather than formatting a JSON string and parsing
# it back, so quoting/escaping cannot break the payload.
request_dict = {
    "operationName": "recommendedEventsWithSeries",
    "variables": {
        "first": num_of_events,
        # Centre of the search area
        "lat": 40.41999816894531,
        "lon": -3.7100000381469727,
        "startDateRange": f"{start_date}T18:39:37-05:00[US/Eastern]",
        "eventType": "PHYSICAL",
        "numberOfEventsForSeries": 10,
        "seriesStartDate": start_date,
        "sortField": "RELEVANCE",
        "doConsolidateEvents": True,
    },
    "extensions": {
        "persistedQuery": {
            "version": 1,
            "sha256Hash": "2461ce7745f8175aac6c500a5189fbc5a86e50b4603832f95036650c8b3fb697",
        },
    },
}

# Kept for backward compatibility: the payload as a JSON string
event_request = json.dumps(request_dict, indent=4)

# A timeout makes a stalled connection fail instead of hanging forever
r = requests.post('https://www.meetup.com/gql2', json=request_dict, timeout=30)

if r.status_code == 200:
    # r.json() already returns parsed Python objects
    response_dict = r.json()

    # A 200 response can still lack the expected payload, so every level is
    # checked before it is dereferenced
    result = (response_dict.get('data') or {}).get('result') or {}
    edges = result.get('edges')

    if edges is None:
        print(f"Unexpected response, no events returned. Errors: {response_dict.get('errors')}")
    else:
        for value in edges:
            (description, date, time, image_url, location, lat, lon,
             map_link, event_group, event_name, event_url) = get_event_fields(value)

            data.append([image_url, event_name, event_url,
                         date, time, event_group, description, location, map_link])

        # Save the data to a CSV file
        with open('meetup_events_madrid.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["Image URL", "Name", "Event URL", "Date", "Time", "Group", "Description",
                             "Location", "Map Link"])
            writer.writerows(data)
else:
    print(f"Something went wrong. Status code: {r.status_code}")

Salu2

Valora esta respuesta

Comentar

Web Scrapping MeetUp

Publicado por mariona (17 intervenciones) el 21/12/2023 15:18:45

Wow no sabía que se podía hacer así. Se lo agradezco mucho.
Espero no molestarle más, estoy muy agradecida por su ayuda!!

Valora esta respuesta

Comentar

Python

Scrapear info TikTok