Python - Web Scrapping EventBrite

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 28/11/2023 13:45:48

Hola, quiero scrappear eventos de EventBrite.
Quiero coger titulo evento, imagen del evento, fecha evento, hora evento, localización evento, descripción evento y categorías evento.

Estoy haciendo el código en PyCharm y tengo el siguiente código:

from selenium import webdriver
from bs4 import BeautifulSoup
import csv
from dateutil import parser
from datetime import datetime, timedelta
from dateutil import tz

# Definir la función para normalizar fechas
def normalize_date(input_date):
if "today" in input_date.lower():
return datetime.now().date()
elif "tomorrow" in input_date.lower():
return (datetime.now() + timedelta(days=1)).date()
elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):
# Encontramos un día de la semana, calculamos la próxima fecha
current_date = datetime.now().date()
days_until_next_day = (current_date.weekday() - ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(input_date.lower())) % 7
return (current_date + timedelta(days=days_until_next_day)).strftime("%a, %b %d")
try:
# Cambiar el formato de fecha para omitir las comillas
return datetime.strptime(input_date, "%a, %b %d").date().replace(year=datetime.now().year).strftime("%Y-%m-%d")
except ValueError:
try:
# Intentar convertir a formato de fecha directamente
return datetime.strptime(input_date, "%Y-%m-%d").date()
except ValueError:
return input_date # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

# URL base de Eventbrite para eventos en Madrid
base_url = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page=1"

# Inicializa el controlador web de Selenium
driver = webdriver.Chrome() # Necesitas tener Chrome y chromedriver instalados

# Número de páginas a buscar
num_pages = 5

# Conjunto para evitar eventos duplicados
seen_events = set()

# Lista para almacenar los datos de los eventos
data = []

for page_number in range(1, num_pages + 1):
# Construye la URL de la página actual
url = f"{base_url}{page_number}"

# Realiza la solicitud HTTP para obtener la página
driver.get(url)

# Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)
driver.implicitly_wait(10)

# Obtiene el contenido de la página después de que se haya ejecutado JavaScript
page_source = driver.page_source

# Parsea la página con BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Encuentra todos los elementos que contienen detalles de eventos
event_elements = soup.find_all("div", class_="Stack_root__1ksk7")

for event_element in event_elements:
# Encuentra el nombre del evento y la imagen directamente de la página principal
event_name_element = event_element.find("h2", class_="Typography_root__4bejd")
event_name = event_name_element.text.strip() if event_name_element else "Título no disponible"

# IMAGEN
image_element = soup.find("img", {"class": "event-card-image"})

# Extracta la URL de la imagen
image_url = image_element['src'] if image_element else "URL de imagen no disponible"
# print(image_url)

# URL DEL EVENTO
url_element = event_element.find("a", class_="event-card-link")
event_url = url_element['href'] if url_element else "URL de evento no disponible"

# DESCRIPCIÓN
description = event_url # Utiliza la URL del evento como descripción

# LOCALIZACIÓN
# Encuentra el elemento del párrafo de la ubicación
location_element = soup.find("p",
class_="Typography_root__4bejd #585163 Typography_body-md__4bejd event-card__clamp-line--one Typography_align-match-parent__4bejd")

# Extrae el texto de la ubicación sin las etiquetas HTML y sin la clase
location = location_element.text.strip() if location_element else "Information not available"

# FECHA
date_element = soup.find("p",
class_="Typography_root__4bejd #3a3247 Typography_body-md-bold__4bejd Typography_align-match-parent__4bejd")

# Extrae el texto de la fecha sin las etiquetas HTML y sin la clase
date_text = date_element.text.strip() if date_element else "Fecha no disponible"

# Procesar la fecha para obtener el formato deseado
date_parts = date_text.split('•')
date = normalize_date(date_parts[0].strip()) if date_parts else normalize_date(date_text)

# HORA
time = date_parts[1].strip() if date_parts else date_text

# Convertir a formato de 24 horas y manejar la zona horaria
try:
# Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria
parsed_time = parser.parse(time)
# Convertir la hora a formato de 24 horas
time = parsed_time.strftime("%H:%M")
except ValueError:
# En caso de error al convertir, dejar el valor original
pass

# Verifica si el evento ya ha sido procesado
if event_name not in seen_events:
# Agrega los datos a la lista y al conjunto
data.append([event_name, image_url, event_url, location, date, time])
seen_events.add(event_name)

# Guardar datos en CSV
with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["Name", "Image URL", "Event URL", "Location", "Date", "Time"])
for event in data:
writer.writerow(event)

print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

# Cerrar el controlador de Selenium
driver.quit()

Pero no consigo acceder a la url de cada evento para coger información de dentro de su url.
Por ejemplo, la variable descripción del evento y categorias evento las tendría que coger de dentro de la url del evento, no de la url base con el listado de los eventos.

No sé cómo hacer el código, ya que no soy programadora.
Si alguien me pudiera ayudar le estaría muy agradecida.

Valora esta pregunta

Me gusta: Está pregunta es útil y esta clara

No me gusta: Está pregunta no esta clara o no es útil

Responder

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 28/11/2023 22:05:33

Trata este.

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from dateutil import tz

# Definir la función para normalizar fechas

def normalize_date(input_date):

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        # Encontramos un día de la semana, calculamos la próxima fecha

        current_date = datetime.now().date()

        days_until_next_day = (current_date.weekday(

        ) - ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(input_date.lower())) % 7

        return (current_date + timedelta(days=days_until_next_day)).strftime("%a, %b %d")

    try:

        # Cambiar el formato de fecha para omitir las comillas

        return datetime.strptime(input_date, "%a, %b %d").date().replace(year=datetime.now().year).strftime("%Y-%m-%d")

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

# URL base de Eventbrite para eventos en Madrid

base_url = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

# Inicializa el controlador web de Selenium

# Necesitas tener Chrome y chromedriver instalados

driver = webdriver.Chrome()

# driver = webdriver.Firefox()

# Número de páginas a buscar

num_pages = 5

# Conjunto para evitar eventos duplicados

seen_events = set()

# Lista para almacenar los datos de los eventos

data = []

for page_number in range(1, num_pages + 1):

    # Construye la URL de la página actual

    url = f"{base_url}{page_number}"

    # print(f'\n===> url: {url}')

    # Realiza la solicitud HTTP para obtener la página

    driver.get(url)

    # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

    driver.implicitly_wait(10)

    # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

    page_source = driver.page_source

    # Parsea la página con BeautifulSoup

    soup = BeautifulSoup(page_source, 'html.parser')

    # Encuentra todos los elementos que contienen detalles de eventos

    event_elements = soup.find_all("div", class_="Stack_root__1ksk7")

    for event_element in event_elements:

        # Encuentra el nombre del evento y la imagen directamente de la página principal

        event_name_element = event_element.find(

            "h2", class_="Typography_root__4bejd")

        event_name = event_name_element.text.strip(

        ) if event_name_element else "Título no disponible"

        # IMAGEN

        image_element = soup.find("img", {"class": "event-card-image"})

        # Extracta la URL de la imagen

        image_url = image_element['src'] if image_element else "URL de imagen no disponible"

        # print(image_url)

        # URL DEL EVENTO

        url_element = event_element.find("a", class_="event-card-link")

        event_url = url_element['href'] if url_element else "URL de evento no disponible"

        # DESCRIPCIÓN

        # Utiliza la URL del evento como descripción

        categoria = "n/a"

        description = "No hay descriccion"

        if "URL de evento no disponible" not in event_url:

            # print(f"\n===> Event url: {event_url}")

            driver.get(event_url)

            driver.implicitly_wait(10)

            page_source = driver.page_source

            soup_evento = BeautifulSoup(page_source, 'html.parser')

            summary_element = soup_evento.find("p", class_="summary")

            description = summary_element.text.strip(

            ) if summary_element else "No hay descricion"

            # print(f"==> {description}")

            # Encontrar las categorias

            tags_element = soup_evento.findAll("a", class_="tags-link")

            categorias = []

            for tag in tags_element:

                # print(f"tag: {tag.text}")

                categorias.append(tag.text)

            categoria = "|".join(categorias)

        # LOCALIZACIÓN

        # Encuentra el elemento del párrafo de la ubicación

        location_element = soup.find("p",

                                     class_="Typography_root__4bejd #585163 Typography_body-md__4bejd event-card__clamp-line--one Typography_align-match-parent__4bejd")

        # Extrae el texto de la ubicación sin las etiquetas HTML y sin la clase

        location = location_element.text.strip(

        ) if location_element else "Information not available"

        # FECHA

        date_element = soup.find("p",

                                 class_="Typography_root__4bejd #3a3247 Typography_body-md-bold__4bejd Typography_align-match-parent__4bejd")

        # Extrae el texto de la fecha sin las etiquetas HTML y sin la clase

        date_text = date_element.text.strip() if date_element else "Fecha no disponible"

        # Procesar la fecha para obtener el formato deseado

        date_parts = date_text.split('•')

        date = normalize_date(date_parts[0].strip(

        )) if date_parts else normalize_date(date_text)

        # HORA

        time = date_parts[1].strip() if date_parts else date_text

        # Convertir a formato de 24 horas y manejar la zona horaria

        try:

            # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

            parsed_time = parser.parse(time)

            # Convertir la hora a formato de 24 horas

            time = parsed_time.strftime("%H:%M")

        except ValueError:

            # En caso de error al convertir, dejar el valor original

            pass

        # Verifica si el evento ya ha sido procesado

        if event_name not in seen_events:

            # Agrega los datos a la lista y al conjunto

            data.append(

                [event_name, image_url, event_url, location, date, time, description, categoria])

            seen_events.add(event_name)

# Guardar datos en CSV

with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

    writer = csv.writer(csvfile)

    writer.writerow(["Name", "Image URL", "Event URL",

                    "Location", "Date", "Time", "Description", "Categoria"])

    for event in data:

        writer.writerow(event)

print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

# Cerrar el controlador de Selenium

driver.quit()

Suerte

Valora esta respuesta

Me gusta: Está respuesta es útil y esta clara

No me gusta: Está respuesta no esta clara o no es útil

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 29/11/2023 09:24:01

Muchas gracias por tu ayuda! Parece que funciona bien el código.
Te lo agradezco mucho.

Una última pregunta, como puedo poner una condición para que me extraiga solo información de eventos que vayan a transcurrir a partir de 15 días en adelante?
Y otra consulta, para las fechas. Cuando tienen formato texto (saturday, sunday, tomorrow,...) me devuelve bien la fecha con formato Tomorrow • 10:00 AM GMT+9,2023-11-30 y no se como eliminar la parte del Tomorrow • 10:00 AM GMT+9.
Y lo mismo me pasa con el formato Wed, 06 Dec que me devuelve la fecha en formato 2023-12-06 pero me escribe por pantalla ambas cosas.

Siento las molestias,

Muchas gracias

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 29/11/2023 18:30:49

Este debe de hacer lo que pides.

Hay una nueva variable para dias en adelante,

forward_days = 5

Actualmente tiene 5 dias, puedes cambiarlo a 15 o cualquier otro numero

Si lo cambias a 15 quizas entonces tengas cambiar el numero de pages a mas, o no tendras data en la csv.

Suerte

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from dateutil import tz

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"La date no es date type: {date}")

        return False

def normalize_date(input_date):

    '''Definir la función para normalizar fechas'''

    # print(f"===> input_date: [{input_date}]")

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        # Encontramos un día de la semana, calculamos la próxima fecha

        current_date = datetime.now().date()

        days_until_next_day = (current_date.weekday(

        ) - ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(input_date.lower())) % 7

        # return (current_date + timedelta(days=days_until_next_day)).strftime("%a, %b %d")

        return (datetime.now() + timedelta(days=days_until_next_day)).date()

    try:

        # Cambiar el formato de fecha para omitir las comillas

        value = datetime.strptime(input_date, "%a, %b %d").date().replace(

            year=datetime.now().year).strftime("%Y-%m-%d")

        return datetime.strptime(value, "%Y-%m-%d").date()

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            value = datetime.strptime(input_date, "%a, %d %b").date().replace(

                year=datetime.now().year).strftime("%Y-%m-%d")

            return datetime.strptime(value, "%Y-%m-%d").date()

            # return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

def main():

    # URL base de Eventbrite para eventos en Madrid

    base_url = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # driver = webdriver.Firefox()

    # Número de páginas a buscar

    num_pages = 4

    # Dias en adelanto

    forward_days = 5

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        driver.implicitly_wait(10)

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_="Stack_root__1ksk7")

        for event_element in event_elements:

            # Encuentra el nombre del evento y la imagen directamente de la página principal

            event_name_element = event_element.find(

                "h2", class_="Typography_root__4bejd")

            event_name = event_name_element.text.strip(

            ) if event_name_element else "Título no disponible"

            # IMAGEN

            image_element = soup.find("img", {"class": "event-card-image"})

            # Extracta la URL de la imagen

            image_url = image_element['src'] if image_element else "URL de imagen no disponible"

            # print(image_url)

            # URL DEL EVENTO

            url_element = event_element.find("a", class_="event-card-link")

            event_url = url_element['href'] if url_element else "URL de evento no disponible"

            # DESCRIPCIÓN

            # Utiliza la URL del evento como descripción

            categoria = "n/a"

            description = "No hay descriccion"

            if "URL de evento no disponible" not in event_url:

                # print(f"\n===> Event url: {event_url}")

                driver.get(event_url)

                driver.implicitly_wait(10)

                page_source = driver.page_source

                soup_evento = BeautifulSoup(page_source, 'html.parser')

                summary_element = soup_evento.find("p", class_="summary")

                description = summary_element.text.strip(

                ) if summary_element else "No hay descricion"

                # print(f"==> {description}")

                # Encontrar las categorias

                tags_element = soup_evento.findAll("a", class_="tags-link")

                categorias = []

                for tag in tags_element:

                    # print(f"tag: {tag.text}")

                    categorias.append(tag.text)

                categoria = "|".join(categorias)

            # LOCALIZACIÓN

            # Encuentra el elemento del párrafo de la ubicación

            location_element = soup.find("p",

                                         class_="Typography_root__4bejd #585163 Typography_body-md__4bejd event-card__clamp-line--one Typography_align-match-parent__4bejd")

            # Extrae el texto de la ubicación sin las etiquetas HTML y sin la clase

            location = location_element.text.strip(

            ) if location_element else "Information not available"

            # FECHA

            date_element = soup.find("p",

                                     class_="Typography_root__4bejd #3a3247 Typography_body-md-bold__4bejd Typography_align-match-parent__4bejd")

            # Extrae el texto de la fecha sin las etiquetas HTML y sin la clase

            date_text = date_element.text.strip() if date_element else "Fecha no disponible"

            # Procesar la fecha para obtener el formato deseado

            date_parts = date_text.split('•')

            date = normalize_date(date_parts[0].strip(

            )) if date_parts else normalize_date(date_text)

            print(f"Normalize date: [{date}]")

            # HORA

            time = date_parts[1].strip() if date_parts else date_text

            # Convertir a formato de 24 horas y manejar la zona horaria

            try:

                # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

                parsed_time = parser.parse(time)

                # Convertir la hora a formato de 24 horas

                time = parsed_time.strftime("%H:%M")

            except ValueError:

                # En caso de error al convertir, dejar el valor original

                print(f"-- Error parsing {time}")

                pass

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events and is_date_ok(date, forward_days):

                print(f"===> Event name: [{event_name}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, date, time, description, categoria])

                seen_events.add(event_name)

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Date", "Time", "Description", "Categoria"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 30/11/2023 09:27:01

Sí que funciona, pero hay eventos que tienen formato fecha : 5:00 PM + 7 more
(que no sé porque porque les puse condición de formato 24h) y entonces estos eventos no aparecen.

No entiendo porqué en la variable Location me devuelve para todos los eventos: Parroquia de san Bruno y la misma hora para todos los eventos...
Es raro.

Gracias por tu ayuda igualmente. Te lo agradezco mucho

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 30/11/2023 18:57:08

Arregle algunos bugs que tenia el codigo original. La location debe de funcionar ahora. Solo puse la locaccion no la direccion.

El programa esta disen~ado para entender date como: Tomorrow, monday, etc. or una fecha, pero si ve algo como: 5:00 PM + 7 more or un texto, no sabe que hacer y no pone ese evento.

Tambien puse debug print para ver lo que pasa

Suerte

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from dateutil import tz

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.firefox.options import Options

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        # print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"La date no es date type: {date}")

        return False

def normalize_date(input_date):

    '''Definir la función para normalizar fechas'''

    # print(f"===> input_date: [{input_date}]")

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        # Encontramos un día de la semana, calculamos la próxima fecha

        current_date = datetime.now().date()

        days_until_next_day = (current_date.weekday(

        ) - ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"].index(input_date.lower())) % 7

        # return (current_date + timedelta(days=days_until_next_day)).strftime("%a, %b %d")

        return (datetime.now() + timedelta(days=days_until_next_day)).date()

    try:

        # Cambiar el formato de fecha para omitir las comillas

        value = datetime.strptime(input_date, "%a, %b %d").date().replace(

            year=datetime.now().year).strftime("%Y-%m-%d")

        return datetime.strptime(value, "%Y-%m-%d").date()

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            value = datetime.strptime(input_date, "%a, %d %b").date().replace(

                year=datetime.now().year).strftime("%Y-%m-%d")

            return datetime.strptime(value, "%Y-%m-%d").date()

            # return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

def main():

    # URL base de Eventbrite para eventos en Madrid

    base_url = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # options = Options()

    # options.headless = False

    # driver = webdriver.Firefox(options=options)

    # Número de páginas a buscar

    num_pages = 5

    # Dias en adelanto

    forward_days = 15

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        # driver.implicitly_wait(10)

        WebDriverWait(driver, 10).until(

            EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_="Stack_root__1ksk7")

        for event_element in event_elements:

            # FECHA

            date_element = event_element.find("p",

                                              class_="Typography_root__4bejd #585163 Typography_body-md__4bejd event-card__clamp-line--one Typography_align-match-parent__4bejd")

            # Extrae el texto de la fecha sin las etiquetas HTML y sin la clase

            date_text = date_element.text.strip() if date_element else "Fecha no disponible"

            # Encuentra el nombre del evento y la imagen directamente de la página principal

            event_name_element = event_element.find(

                "h2", class_="Typography_root__4bejd")

            event_name = event_name_element.text.strip(

            ) if event_name_element else "Título no disponible"

            print(

                f"\n----- Date_text: [{date_text}] and event name: [{event_name}]")

            # Procesar la fecha para obtener el formato deseado

            date_parts = date_text.split('•')

            date = normalize_date(date_parts[0].strip(

            )) if date_parts else normalize_date(date_text)

            print(f"Normalize date: [{date}] and date_text: [{date_text}]")

            if not is_date_ok(date, forward_days):

                continue

            # HORA

            time = date_parts[1].strip() if date_parts else date_text

            # IMAGEN

            image_element = event_element.find(

                "img", {"class": "event-card-image"})

            # Extracta la URL de la imagen

            image_url = image_element['src'] if image_element else "URL de imagen no disponible"

            # print(image_url)

            # URL DEL EVENTO

            url_element = event_element.find("a", class_="event-card-link")

            event_url = url_element['href'] if url_element else "URL de evento no disponible"

            # DESCRIPCIÓN

            # Utiliza la URL del evento como descripción

            categoria = "n/a"

            description = "No hay descriccion"

            location = "No hay locaccion"

            if "URL de evento no disponible" not in event_url:

                # print(f"\n===> Event url: {event_url}")

                driver.get(event_url)

                # driver.implicitly_wait(10)

                page_source = driver.page_source

                WebDriverWait(driver, 10).until(

                    EC.presence_of_element_located((By.CLASS_NAME, "summary")))

                soup_evento = BeautifulSoup(page_source, 'html.parser')

                summary_element = soup_evento.find("p", class_="summary")

                description = summary_element.text.strip(

                ) if summary_element else "No hay descricion"

                location_element = soup_evento.find(

                    "p", class_="location-info__address-text")

                location = location_element.text.strip(

                ) if location_element else "No hay locaccion"

                # print(f"==> {description}")

                # Encontrar las categorias

                tags_element = soup_evento.findAll("a", class_="tags-link")

                categorias = []

                for tag in tags_element:

                    # print(f"tag: {tag.text}")

                    categorias.append(tag.text)

                categoria = "|".join(categorias)

            # Convertir a formato de 24 horas y manejar la zona horaria

            try:

                # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

                parsed_time = parser.parse(time)

                # Convertir la hora a formato de 24 horas

                time = parsed_time.strftime("%H:%M")

            except ValueError:

                # En caso de error al convertir, dejar el valor original

                print(f"-- Error parsing {time}")

                pass

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events:

                print(f"===> Event name: [{event_name}] and date: [{date}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, date, time, description, categoria])

                seen_events.add(event_name)

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Date", "Time", "Description", "Categoria"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 01/12/2023 06:17:52

He notado que BeatifulSoup a veces no coge la fecha sino el texto, asi que ese record no se pone a la csv.
El fin de semana le doy un vistazo

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 01/12/2023 10:23:46

Muchas gracias por tu ayuda

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 03/12/2023 02:45:59

Ok, BeatifulSoup no tiene nigun bug. Es el website que no es consistente con los valores del class para las fechas (cambia un numero). Tuve que hacer un truco para arreglar el problema. Ahora encuentra todas las fechas.
Si el evento no tiene fecha tambien se an~ade a la cvs con la fecha columna diciendo "Fecha no disponible", creo que eso era lo que tu querias.
Para los valores monday, tuesday, etc. el programa no estaba calculando correctamente la fecha, asi que puse mi solucion

Suerte

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.firefox.options import Options

NUMERO_DE_PAGINAS_A_BUSCAR = 5

DIAS_EN_ADELANTO = 4

BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events/?page="

# BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

FECHA_NO_DISPONIBLE = "Fecha no disponible"

HORA_NO_DISPONIBLE = "Hora no disponible"

TITULO_NO_DISPONIBLE = "Título no disponible"

URL_IMAGEN_NO_DISPONIBLE = "URL de imagen no disponible"

URL_EVENTO_NO_DISPONIBLE = "URL de evento no disponible"

NO_HAY_DESCRICCION = "No hay descricion"

NO_HAY_LOCACCION = "No hay locaccion"

CLASS_EVENT_NAME = "Typography_root__4bejd"

CLASS_URL_EVENT = "event-card-link"

CLASS_IMAGE_EVENT = "event-card-image"

CLASS_DATE_EVENT = "Typography_root__4bejd"

CLASS_SUMMARY = "summary"

CLASS_TAGS = "tags-link"

CLASS_EVENT_ELEMENT = "Stack_root__1ksk7"

def get_event_time(date_parts, date_text):

    '''Retorna la hora del evento'''

    time = date_parts[1].strip() if date_parts else date_text

    # Convertir a formato de 24 horas y manejar la zona horaria

    try:

        # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

        parsed_time = parser.parse(time)

        # Convertir la hora a formato de 24 horas

        time = parsed_time.strftime("%H:%M")

    except ValueError:

        # En caso de error al convertir, dejar el valor original

        print(f"-- Error parsing [{time}]")

        time = "00:00"

    return time

def get_event_date(date_elements, event_name):

    '''Retorna la fecha del evento'''

    date = None

    date_text = None

    date_parts = None

    for delement in date_elements:

        date_text = delement.text.strip() if delement else FECHA_NO_DISPONIBLE

        print(

            f"-----+++> event name: [{event_name}] and Date_text: [{date_text}]")

        date_parts = date_text.split('•')

        date = normalize_date(date_parts[0].strip(

        )) if date_parts else normalize_date(date_text)

        if date.__class__.__name__ == 'date':

            break

    return date, date_text, date_parts

def get_event_date_time(event_element, event_name, forward_days):

    '''Retorna el evento fecha y tiempo'''

    date_elements = event_element.find_all(

        "p", class_=CLASS_DATE_EVENT)

    date, date_text, date_parts = get_event_date(

        date_elements=date_elements, event_name=event_name)

    time = HORA_NO_DISPONIBLE

    if date.__class__.__name__ == 'date':

        if is_date_ok(date, forward_days):

            # HORA

            time = get_event_time(date_parts=date_parts, date_text=date_text)

        else:

            print(

                f"---> Removing because date [{date}] is less than date + forward days: [{forward_days}]")

            date = None

    else:

        date = FECHA_NO_DISPONIBLE

    return date, time

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        # print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"---> La fecha no es date type: {date}")

        return False

def get_date_from_weekday(input_date):

    '''Calculamos la fecha al dia de la semana'''

    days_of_week = {

        0: "monday",

        1: "tuesday",

        2: "wednesday",

        3: "thursday",

        4: "friday",

        5: "saturday",

        6: "sunday",

    current_date = datetime.now().date()

    for i in range(6):

        current_date = current_date + timedelta(days=1)

        value = days_of_week[current_date.weekday()]

        if input_date.lower() == value:

            break

    return current_date

def normalize_date(input_date):

    '''Definir la función para normalizar fechas'''

    # print(f"===> input_date: [{input_date}]")

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        return get_date_from_weekday(input_date=input_date)

    try:

        # Cambiar el formato de fecha para omitir las comillas

        value = datetime.strptime(input_date, "%a, %b %d").date().replace(

            year=datetime.now().year).strftime("%Y-%m-%d")

        return datetime.strptime(value, "%Y-%m-%d").date()

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            value = datetime.strptime(input_date, "%a, %d %b").date().replace(

                year=datetime.now().year).strftime("%Y-%m-%d")

            return datetime.strptime(value, "%Y-%m-%d").date()

            # return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

def get_cat_desc_location(driver, event_url):

    '''Retorna la categorias, descriccion y locaccion del evento'''

    categoria = "n/a"

    description = "No hay descriccion"

    location = "No hay locaccion"

    if "URL de evento no disponible" not in event_url:

        # print(f"\n===> Event url: {event_url}")

        driver.get(event_url)

        # driver.implicitly_wait(10)

        page_source = driver.page_source

        WebDriverWait(driver, 10).until(

            EC.presence_of_element_located((By.CLASS_NAME, CLASS_SUMMARY)))

        soup_evento = BeautifulSoup(page_source, 'html.parser')

        summary_element = soup_evento.find("p", class_=CLASS_SUMMARY)

        description = summary_element.text.strip(

        ) if summary_element else NO_HAY_DESCRICCION

        location_element = soup_evento.find(

            "p", class_="location-info__address-text")

        location = location_element.text.strip(

        ) if location_element else NO_HAY_LOCACCION

        # print(f"==> {description}")

        # Encontrar las categorias

        tags_element = soup_evento.findAll("a", class_=CLASS_TAGS)

        categorias = []

        for tag in tags_element:

            # print(f"tag: {tag.text}")

            categorias.append(tag.text)

        categoria = "|".join(categorias)

    return categoria, description, location

def main():

    # URL base de Eventbrite para eventos en Madrid

    # base_url = BASE_URL

    base_url = BASE_URL

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # options = Options()

    # options.headless = True

    # driver = webdriver.Firefox(options=options)

    # Número de páginas a buscar

    num_pages = NUMERO_DE_PAGINAS_A_BUSCAR

    # Dias en adelanto

    forward_days = DIAS_EN_ADELANTO

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        driver.implicitly_wait(10)

        # WebDriverWait(driver, 10).until(

        #     EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # with open('output.txt', 'a') as f:

        #     f.write(page_source)

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_=CLASS_EVENT_ELEMENT)

        print(f"====> Number of rows found: {len(event_elements)}")

        for event_element in event_elements:

            # Event name

            event_name_element = event_element.find(

                "h2", class_=CLASS_EVENT_NAME)

            event_name = event_name_element.text.strip(

            ) if event_name_element else TITULO_NO_DISPONIBLE

            if TITULO_NO_DISPONIBLE in event_name:

                continue

            # Event date and time

            date, time = get_event_date_time(

                event_element=event_element, event_name=event_name, forward_days=forward_days)

            if date is None:

                continue

            # IMAGEN

            image_element = event_element.find(

                "img", {"class": CLASS_IMAGE_EVENT})

            # Extracta la URL de la imagen

            image_url = image_element['src'] if image_element else URL_IMAGEN_NO_DISPONIBLE

            # print(image_url)

            # URL DEL EVENTO

            url_element = event_element.find("a", class_=CLASS_URL_EVENT)

            event_url = url_element['href'] if url_element else URL_EVENTO_NO_DISPONIBLE

            categoria, description, location = get_cat_desc_location(

                driver, event_url)

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events:

                print(

                    f"===> Adding event name: [{event_name}] and date: [{date}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, date, time, description, categoria])

                seen_events.add(event_name)

            else:

                print(

                    f"----> Already added event name: [{event_name}] and date: [{date}]")

            print("\n ---")

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Date", "Time", "Description", "Categoria"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

if __name__ == "__main__":

    main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 04/12/2023 10:12:30

Muchísimas gracias por tu ayuda, me ha servido de mucho!

Te lo agradezco

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 04/12/2023 11:58:29

Verdad que no hay manera de conseguir el link del mapa con la dirección de la localización?
Estoy tratando de sacar una variable con el link del mapa pero no veo en la estructura html de la web ningún link para extraer...

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 04/12/2023 18:39:31

Hay una nueva columna en el csv "address" con la direccion.

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.firefox.options import Options

NUMERO_DE_PAGINAS_A_BUSCAR = 1

DIAS_EN_ADELANTO = 4

BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events/?page="

# BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

FECHA_NO_DISPONIBLE = "Fecha no disponible"

HORA_NO_DISPONIBLE = "Hora no disponible"

TITULO_NO_DISPONIBLE = "Título no disponible"

URL_IMAGEN_NO_DISPONIBLE = "URL de imagen no disponible"

URL_EVENTO_NO_DISPONIBLE = "URL de evento no disponible"

NO_HAY_DESCRICCION = "No hay descricion"

NO_HAY_LOCACCION = "No hay locaccion"

NO_HAY_DIRECCION = "No hay direccion"

CLASS_EVENT_NAME = "Typography_root__4bejd"

CLASS_URL_EVENT = "event-card-link"

CLASS_IMAGE_EVENT = "event-card-image"

CLASS_DATE_EVENT = "Typography_root__4bejd"

CLASS_SUMMARY = "summary"

CLASS_TAGS = "tags-link"

CLASS_EVENT_ELEMENT = "Stack_root__1ksk7"

CLASS_LOCATION = "location-info__address-text"

CLASS_ADDRESS = "location-info__address"

def get_event_time(date_parts, date_text):

    '''Retorna la hora del evento'''

    time = date_parts[1].strip() if date_parts else date_text

    # Convertir a formato de 24 horas y manejar la zona horaria

    try:

        # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

        parsed_time = parser.parse(time)

        # Convertir la hora a formato de 24 horas

        time = parsed_time.strftime("%H:%M")

    except ValueError:

        # En caso de error al convertir, dejar el valor original

        print(f"-- Error parsing [{time}]")

        time = "00:00"

    return time

def get_event_date(date_elements, event_name):

    '''Retorna la fecha del evento'''

    date = None

    date_text = None

    date_parts = None

    for delement in date_elements:

        date_text = delement.text.strip() if delement else FECHA_NO_DISPONIBLE

        print(

            f"-----+++> event name: [{event_name}] and Date_text: [{date_text}]")

        date_parts = date_text.split('•')

        date = normalize_date(date_parts[0].strip(

        )) if date_parts else normalize_date(date_text)

        if date.__class__.__name__ == 'date':

            break

    return date, date_text, date_parts

def get_event_date_time(event_element, event_name, forward_days):

    '''Retorna el evento fecha y tiempo'''

    date_elements = event_element.find_all(

        "p", class_=CLASS_DATE_EVENT)

    date, date_text, date_parts = get_event_date(

        date_elements=date_elements, event_name=event_name)

    time = HORA_NO_DISPONIBLE

    if date.__class__.__name__ == 'date':

        if is_date_ok(date, forward_days):

            # HORA

            time = get_event_time(date_parts=date_parts, date_text=date_text)

        else:

            print(

                f"---> Removing because date [{date}] is less than date + forward days: [{forward_days}]")

            date = None

    else:

        date = FECHA_NO_DISPONIBLE

    return date, time

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        # print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"---> La fecha no es date type: {date}")

        return False

def get_date_from_weekday(input_date):

    '''Calculamos la fecha al dia de la semana'''

    days_of_week = {

        0: "monday",

        1: "tuesday",

        2: "wednesday",

        3: "thursday",

        4: "friday",

        5: "saturday",

        6: "sunday",

    current_date = datetime.now().date()

    for i in range(6):

        current_date = current_date + timedelta(days=1)

        value = days_of_week[current_date.weekday()]

        if input_date.lower() == value:

            break

    return current_date

def normalize_date(input_date):

    '''Definir la función para normalizar fechas'''

    # print(f"===> input_date: [{input_date}]")

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        return get_date_from_weekday(input_date=input_date)

    try:

        # Cambiar el formato de fecha para omitir las comillas

        value = datetime.strptime(input_date, "%a, %b %d").date().replace(

            year=datetime.now().year).strftime("%Y-%m-%d")

        return datetime.strptime(value, "%Y-%m-%d").date()

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            value = datetime.strptime(input_date, "%a, %d %b").date().replace(

                year=datetime.now().year).strftime("%Y-%m-%d")

            return datetime.strptime(value, "%Y-%m-%d").date()

            # return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

def get_cat_desc_location(driver, event_url):

    '''Retorna la categorias, descriccion y locaccion del evento'''

    categoria = "n/a"

    description = "No hay descriccion"

    location = "No hay locaccion"

    if "URL de evento no disponible" not in event_url:

        # print(f"\n===> Event url: {event_url}")

        driver.get(event_url)

        # driver.implicitly_wait(10)

        page_source = driver.page_source

        WebDriverWait(driver, 10).until(

            EC.presence_of_element_located((By.CLASS_NAME, CLASS_SUMMARY)))

        soup_evento = BeautifulSoup(page_source, 'html.parser')

        summary_element = soup_evento.find("p", class_=CLASS_SUMMARY)

        description = summary_element.text.strip(

        ) if summary_element else NO_HAY_DESCRICCION

        location_element = soup_evento.find(

            "p", class_=CLASS_LOCATION)

        location = location_element.text.strip(

        ) if location_element else NO_HAY_LOCACCION

        address_element = soup_evento.find(

            "div", class_=CLASS_ADDRESS)

        address = address_element.text.strip(

        ) if address_element else NO_HAY_DIRECCION

        # Encontrar las categorias

        tags_element = soup_evento.findAll("a", class_=CLASS_TAGS)

        categorias = []

        for tag in tags_element:

            # print(f"tag: {tag.text}")

            categorias.append(tag.text)

        categoria = "|".join(categorias)

    return categoria, description, location, address

def main():

    # URL base de Eventbrite para eventos en Madrid

    # base_url = BASE_URL

    base_url = BASE_URL

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # options = Options()

    # options.headless = False

    # driver = webdriver.Firefox(options=options)

    # Número de páginas a buscar

    num_pages = NUMERO_DE_PAGINAS_A_BUSCAR

    # Dias en adelanto

    forward_days = DIAS_EN_ADELANTO

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        driver.implicitly_wait(10)

        # WebDriverWait(driver, 10).until(

        #     EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # with open('output.txt', 'a') as f:

        #     f.write(page_source)

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_=CLASS_EVENT_ELEMENT)

        print(f"====> Number of rows found: {len(event_elements)}")

        for event_element in event_elements:

            # Event name

            event_name_element = event_element.find(

                "h2", class_=CLASS_EVENT_NAME)

            event_name = event_name_element.text.strip(

            ) if event_name_element else TITULO_NO_DISPONIBLE

            if TITULO_NO_DISPONIBLE in event_name:

                continue

            # Event date and time

            date, time = get_event_date_time(

                event_element=event_element, event_name=event_name, forward_days=forward_days)

            if date is None:

                continue

            # IMAGEN

            image_element = event_element.find(

                "img", {"class": CLASS_IMAGE_EVENT})

            # Extracta la URL de la imagen

            image_url = image_element['src'] if image_element else URL_IMAGEN_NO_DISPONIBLE

            # print(image_url)

            # URL DEL EVENTO

            url_element = event_element.find("a", class_=CLASS_URL_EVENT)

            event_url = url_element['href'] if url_element else URL_EVENTO_NO_DISPONIBLE

            categoria, description, location, address = get_cat_desc_location(

                driver, event_url)

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events:

                print(

                    f"===> Adding event name: [{event_name}] and date: [{date}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, address, date, time, description, categoria])

                seen_events.add(event_name)

            else:

                print(

                    f"----> Already added event name: [{event_name}] and date: [{date}]")

            print("\n ---")

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Address", "Date", "Time", "Description", "Categoria"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

if __name__ == "__main__":

    main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 05/12/2023 09:34:12

De acuerdo,

Entonces es verddad que el link del mapa es imposible cogerlo, y cogemos la location info como adress.
Habría alguna manera para que me separara location name de location info? Porque en adress me devuelve ambas cosas.
He intentado que me coja solo la última parte pero me devuelve No hay direccion para todos los casos.
address_element = soup_evento.find("div", class_="location-info__address")

if address_element:
address_lines = address_element.text.strip().split('\n')
if len(address_lines) >= 2:
address = address_lines[1].strip()
else:
address = NO_HAY_DIRECCION
else:
address = NO_HAY_DIRECCION

Probé esto y nada.

Gracias igualmente, me has ayudado mucho

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 05/12/2023 10:44:56

Igual no obstante, podría sacar en una columna la latitud y en otra columna la longitud como solución a no poder obtener el map_link para luego poder generarlo.

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 05/12/2023 11:03:38

He intentado esto para extraer una columna con la latitud y otra con la longitud pero no me funciona:

from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import json
from dateutil import parser
from datetime import datetime, timedelta
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

NUMERO_DE_PAGINAS_A_BUSCAR = 1
DIAS_EN_ADELANTO = 4
BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events/?page="
# BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

FECHA_NO_DISPONIBLE = "Fecha no disponible"
HORA_NO_DISPONIBLE = "Hora no disponible"
TITULO_NO_DISPONIBLE = "Título no disponible"
URL_IMAGEN_NO_DISPONIBLE = "URL de imagen no disponible"
URL_EVENTO_NO_DISPONIBLE = "URL de evento no disponible"
NO_HAY_DESCRICCION = "No hay descricion"
NO_HAY_LOCACCION = "No hay locaccion"
NO_HAY_DIRECCION = "No hay direccion"

CLASS_EVENT_NAME = "Typography_root__4bejd"
CLASS_URL_EVENT = "event-card-link"
CLASS_IMAGE_EVENT = "event-card-image"
CLASS_DATE_EVENT = "Typography_root__4bejd"
CLASS_SUMMARY = "summary"
CLASS_TAGS = "tags-link"
CLASS_EVENT_ELEMENT = "Stack_root__1ksk7"
CLASS_LOCATION = "location-info__address-text"
CLASS_ADDRESS = "location-info__address"

def get_event_time(date_parts, date_text):
'''Retorna la hora del evento'''
time = date_parts[1].strip() if date_parts else date_text

# Convertir a formato de 24 horas y manejar la zona horaria
try:
# Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria
parsed_time = parser.parse(time)

# Convertir la hora a formato de 24 horas
time = parsed_time.strftime("%H:%M")

except ValueError:
# En caso de error al convertir, dejar el valor original
print(f"-- Error parsing [{time}]")
time = "00:00"
return time

def get_event_date(date_elements, event_name):
'''Retorna la fecha del evento'''
date = None
date_text = None
date_parts = None
for delement in date_elements:
date_text = delement.text.strip() if delement else FECHA_NO_DISPONIBLE
print(
f"-----+++> event name: [{event_name}] and Date_text: [{date_text}]")
date_parts = date_text.split('•')
date = normalize_date(date_parts[0].strip(
)) if date_parts else normalize_date(date_text)
if date.__class__.__name__ == 'date':
break
return date, date_text, date_parts

def get_event_date_time(event_element, event_name, forward_days):
'''Retorna el evento fecha y tiempo'''
date_elements = event_element.find_all(
"p", class_=CLASS_DATE_EVENT)

date, date_text, date_parts = get_event_date(
date_elements=date_elements, event_name=event_name)

time = HORA_NO_DISPONIBLE
if date.__class__.__name__ == 'date':
if is_date_ok(date, forward_days):
# HORA
time = get_event_time(date_parts=date_parts, date_text=date_text)
else:
print(
f"---> Removing because date [{date}] is less than date + forward days: [{forward_days}]")
date = None
else:
date = FECHA_NO_DISPONIBLE
return date, time

def is_date_ok(date, forward_days):
'''Chequea que los dias del date sean mayor o igual a dias en adelante'''

try:
today = datetime.now().date()
delta = date - today

# print(f"===> Delta days: {delta.days}")
return delta.days >= forward_days

except TypeError:
print(f"---> La fecha no es date type: {date}")
return False

def get_date_from_weekday(input_date):
'''Calculamos la fecha al dia de la semana'''
days_of_week = {
0: "monday",
1: "tuesday",
2: "wednesday",
3: "thursday",
4: "friday",
5: "saturday",
6: "sunday",
}

current_date = datetime.now().date()
for i in range(6):
current_date = current_date + timedelta(days=1)
value = days_of_week[current_date.weekday()]
if input_date.lower() == value:
break
return current_date

def normalize_date(input_date):
'''Definir la función para normalizar fechas'''

# print(f"===> input_date: [{input_date}]")

if "today" in input_date.lower():
return datetime.now().date()

elif "tomorrow" in input_date.lower():
return (datetime.now() + timedelta(days=1)).date()

elif any(day in input_date.lower() for day in
["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):
return get_date_from_weekday(input_date=input_date)

try:
# Cambiar el formato de fecha para omitir las comillas
value = datetime.strptime(input_date, "%a, %b %d").date().replace(
year=datetime.now().year).strftime("%Y-%m-%d")
return datetime.strptime(value, "%Y-%m-%d").date()

except ValueError:
try:
# Intentar convertir a formato de fecha directamente
value = datetime.strptime(input_date, "%a, %d %b").date().replace(
year=datetime.now().year).strftime("%Y-%m-%d")

return datetime.strptime(value, "%Y-%m-%d").date()
# return datetime.strptime(input_date, "%Y-%m-%d").date()

except ValueError:
# Devuelve la entrada original si no coincide con ninguno de los casos anteriores
return input_date

def get_cat_desc_location(driver, event_url):
'''Retorna la categorias, descriccion y locaccion del evento'''
categoria = "n/a"
description = "No hay descriccion"
location = "No hay locaccion"

if "URL de evento no disponible" not in event_url:

# print(f"\n===> Event url: {event_url}")

driver.get(event_url)

# driver.implicitly_wait(10)

page_source = driver.page_source

WebDriverWait(driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, CLASS_SUMMARY)))

soup_evento = BeautifulSoup(page_source, 'html.parser')

summary_element = soup_evento.find("p", class_=CLASS_SUMMARY)

description = summary_element.text.strip(
) if summary_element else NO_HAY_DESCRICCION

location_element = soup_evento.find(
"p", class_=CLASS_LOCATION)

location = location_element.text.strip(
) if location_element else NO_HAY_LOCACCION

address_element = soup_evento.find(
"div", class_=CLASS_ADDRESS)

address = address_element.text.strip(
) if address_element else NO_HAY_DIRECCION

# Encontrar las categorias
tags_element = soup_evento.findAll("a", class_=CLASS_TAGS)

categorias = []

for tag in tags_element:
# print(f"tag: {tag.text}")
categorias.append(tag.text)

categoria = "|".join(categorias)

return categoria, description, location, address
def extract_lat_long_from_base_url(base_url):
'''Extrae la latitud y longitud de la URL base'''
latitude = "No hay latitud"
longitude = "No hay longitud"

try:
# Extraer la latitud y longitud de la URL base
latitude = base_url.split("latitude=")[1].split("&")[0]
longitude = base_url.split("longitude=")[1].split("&")[0]
except IndexError:
print("---> Error extrayendo latitud y longitud de la URL base.")

return latitude, longitude

def main():
# URL base de Eventbrite para eventos en Madrid

# base_url = BASE_URL
base_url = BASE_URL

# Inicializa el controlador web de Selenium
# Necesitas tener Chrome y chromedriver instalados
driver = webdriver.Chrome()

# options = Options()
# options.headless = False
# driver = webdriver.Firefox(options=options)

# Número de páginas a buscar
num_pages = NUMERO_DE_PAGINAS_A_BUSCAR

# Dias en adelanto
forward_days = DIAS_EN_ADELANTO

# Conjunto para evitar eventos duplicados
seen_events = set()

# Lista para almacenar los datos de los eventos
data = []

for page_number in range(1, num_pages + 1):

# Construye la URL de la página actual
url = f"{base_url}{page_number}"

print(f'\n\n===> url: {url}')

# Realiza la solicitud HTTP para obtener la página

driver.get(url)

# Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)
driver.implicitly_wait(10)

# WebDriverWait(driver, 10).until(
# EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

# Obtiene el contenido de la página después de que se haya ejecutado JavaScript
page_source = driver.page_source

# with open('output.txt', 'a') as f:
# f.write(page_source)

# Parsea la página con BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Encuentra todos los elementos que contienen detalles de eventos
event_elements = soup.find_all("div", class_=CLASS_EVENT_ELEMENT)

print(f"====> Number of rows found: {len(event_elements)}")

for event_element in event_elements:

# Event name
event_name_element = event_element.find(
"h2", class_=CLASS_EVENT_NAME)
event_name = event_name_element.text.strip(
) if event_name_element else TITULO_NO_DISPONIBLE

if TITULO_NO_DISPONIBLE in event_name:
continue

# Event date and time
date, time = get_event_date_time(
event_element=event_element, event_name=event_name, forward_days=forward_days)

if date is None:
continue

# IMAGEN
image_element = soup.find("img", {"class": "event-card-image"})

# Extracta la URL de la imagen
image_url = image_element['src'] if image_element else "URL de imagen no disponible"
# print(image_url)

# URL DEL EVENTO
url_element = event_element.find("a", class_=CLASS_URL_EVENT)

event_url = url_element['href'] if url_element else URL_EVENTO_NO_DISPONIBLE

# Extraer latitud y longitud de la URL base
event_base_url = event_url.split("?")[0]
event_latitude, event_longitude = extract_lat_long_from_base_url(event_base_url)

categoria, description, location, address = get_cat_desc_location(
driver, event_url)

# Verifica si el evento ya ha sido procesado
if event_name not in seen_events:
print(
f"===> Adding event name: [{event_name}] and date: [{date}]")

# Agrega los datos a la lista y al conjunto
data.append(
[event_name, image_url, event_url, location, address, date, time, description, categoria,
event_latitude, event_longitude])
seen_events.add(event_name)
else:
print(
f"----> Already added event name: [{event_name}] and date: [{date}]")

print("\n ---")

print(f"Seen events size: {len(seen_events)}")

print(f"Number of events: {len(data)}")

# Guardar datos en CSV

with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

writer = csv.writer(csvfile)

writer.writerow(["Name", "Image URL", "Event URL",
"Location", "Address", "Date", "Time", "Description", "Categoria"])

for event in data:
writer.writerow(event)

print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

# Cerrar el controlador de Selenium
driver.quit()

# Comenzando a processar
if __name__ == "__main__":
main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 07/12/2023 20:15:30

La csv tiene ahora 2 nuevas columnas longitude and latitute. El address debe funcionar bien ahora

Puedes poner los valores de longitude y lattitude aqui para chequear:

https://jsfiddle.net/gh/get/library/pure/googlemaps/js-samples/tree/master/dist/samples/advanced-markers-simple/jsfiddle

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.firefox.options import Options

NUMERO_DE_PAGINAS_A_BUSCAR = 1

DIAS_EN_ADELANTO = 4

BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events/?page="

# BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

FECHA_NO_DISPONIBLE = "Fecha no disponible"

HORA_NO_DISPONIBLE = "Hora no disponible"

TITULO_NO_DISPONIBLE = "Título no disponible"

URL_IMAGEN_NO_DISPONIBLE = "URL de imagen no disponible"

URL_EVENTO_NO_DISPONIBLE = "URL de evento no disponible"

NO_HAY_DESCRICCION = "No hay descricion"

NO_HAY_LOCACCION = "No hay locaccion"

NO_HAY_DIRECCION = "No hay direccion"

CLASS_EVENT_NAME = "Typography_root__4bejd"

CLASS_URL_EVENT = "event-card-link"

CLASS_DATE_EVENT = "Typography_root__4bejd"

CLASS_SUMMARY = "summary"

CLASS_TAGS = "tags-link"

CLASS_EVENT_ELEMENT = "Stack_root__1ksk7"

CLASS_LOCATION = "location-info__address-text"

CLASS_ADDRESS = "location-info__address"

def get_event_time(date_parts, date_text):

    '''Retorna la hora del evento'''

    time = date_parts[1].strip() if date_parts else date_text

    # Convertir a formato de 24 horas y manejar la zona horaria

    try:

        # Utilizar dateutil.parser para manejar diferentes formatos de hora y zona horaria

        parsed_time = parser.parse(time)

        # Convertir la hora a formato de 24 horas

        time = parsed_time.strftime("%H:%M")

    except ValueError:

        # En caso de error al convertir, dejar el valor original

        print(f"-- Error parsing [{time}]")

        time = "00:00"

    return time

def get_event_date(date_elements, event_name):

    '''Retorna la fecha del evento'''

    date = None

    date_text = None

    date_parts = None

    for delement in date_elements:

        date_text = delement.text.strip() if delement else FECHA_NO_DISPONIBLE

        print(

            f"-----+++> event name: [{event_name}] and Date_text: [{date_text}]")

        date_parts = date_text.split('•')

        date = normalize_date(date_parts[0].strip(

        )) if date_parts else normalize_date(date_text)

        if date.__class__.__name__ == 'date':

            break

    return date, date_text, date_parts

def get_event_date_time(event_element, event_name, forward_days):

    '''Retorna el evento fecha y tiempo'''

    date_elements = event_element.find_all(

        "p", class_=CLASS_DATE_EVENT)

    date, date_text, date_parts = get_event_date(

        date_elements=date_elements, event_name=event_name)

    time = HORA_NO_DISPONIBLE

    if date.__class__.__name__ == 'date':

        if is_date_ok(date, forward_days):

            # HORA

            time = get_event_time(date_parts=date_parts, date_text=date_text)

        else:

            print(

                f"---> Removing because date [{date}] is less than date + forward days: [{forward_days}]")

            date = None

    else:

        date = FECHA_NO_DISPONIBLE

    return date, time

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        # print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"---> La fecha no es date type: {date}")

        return False

def get_date_from_weekday(input_date):

    '''Calculamos la fecha al dia de la semana'''

    days_of_week = {

        0: "monday",

        1: "tuesday",

        2: "wednesday",

        3: "thursday",

        4: "friday",

        5: "saturday",

        6: "sunday",

    current_date = datetime.now().date()

    for i in range(6):

        current_date = current_date + timedelta(days=1)

        value = days_of_week[current_date.weekday()]

        if input_date.lower() == value:

            break

    return current_date

def normalize_date(input_date):

    '''Definir la función para normalizar fechas'''

    # print(f"===> input_date: [{input_date}]")

    if "today" in input_date.lower():

        return datetime.now().date()

    elif "tomorrow" in input_date.lower():

        return (datetime.now() + timedelta(days=1)).date()

    elif any(day in input_date.lower() for day in ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]):

        return get_date_from_weekday(input_date=input_date)

    try:

        # Cambiar el formato de fecha para omitir las comillas

        value = datetime.strptime(input_date, "%a, %b %d").date().replace(

            year=datetime.now().year).strftime("%Y-%m-%d")

        return datetime.strptime(value, "%Y-%m-%d").date()

    except ValueError:

        try:

            # Intentar convertir a formato de fecha directamente

            value = datetime.strptime(input_date, "%a, %d %b").date().replace(

                year=datetime.now().year).strftime("%Y-%m-%d")

            return datetime.strptime(value, "%Y-%m-%d").date()

            # return datetime.strptime(input_date, "%Y-%m-%d").date()

        except ValueError:

            # Devuelve la entrada original si no coincide con ninguno de los casos anteriores

            return input_date

def get_event_values(driver, event_url):

    '''Retorna la categorias, descriccion y locaccion del evento'''

    categoria = "n/a"

    description = "No hay descriccion"

    location = "No hay locaccion"

    if "URL de evento no disponible" not in event_url:

        # print(f"\n===> Event url: {event_url}")

        driver.get(event_url)

        # driver.implicitly_wait(10)

        page_source = driver.page_source

        WebDriverWait(driver, 10).until(

            EC.presence_of_element_located((By.CLASS_NAME, CLASS_SUMMARY)))

        soup_evento = BeautifulSoup(page_source, 'html.parser')

        summary_element = soup_evento.find("p", class_=CLASS_SUMMARY)

        description = summary_element.text.strip(

        ) if summary_element else NO_HAY_DESCRICCION

        location_element = soup_evento.find(

            "p", class_=CLASS_LOCATION)

        location = location_element.text.strip(

        ) if location_element else NO_HAY_LOCACCION

        address_element = soup_evento.find(

            "div", class_=CLASS_ADDRESS)

        address = address_element.contents[1].strip(

        ) if address_element else NO_HAY_DIRECCION

        # Encontrar las categorias

        tags_element = soup_evento.findAll("a", class_=CLASS_TAGS)

        categorias = []

        for tag in tags_element:

            # print(f"tag: {tag.text}")

            categorias.append(tag.text)

        categoria = "|".join(categorias)

        longitude_meta = soup_evento.find(

            "meta", property="event:location:longitude")

        longitude = longitude_meta["content"] if longitude_meta else "No meta longitude given"

        latitude_meta = soup_evento.find(

            "meta", property="event:location:latitude")

        latitude = latitude_meta["content"] if latitude_meta else "No meta latitude given"

        # IMAGEN

        image_meta = soup_evento.find(

            "meta", property="og:image")

        # Extracta la URL de la imagen

        image_url = image_meta["content"] if image_meta else URL_IMAGEN_NO_DISPONIBLE

        # print(image_url)

    return categoria, description, location, address, longitude, latitude, image_url

def main():

    # URL base de Eventbrite para eventos en Madrid

    # base_url = BASE_URL

    base_url = BASE_URL

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # options = Options()

    # options.headless = True

    # driver = webdriver.Firefox(options=options)

    # Número de páginas a buscar

    num_pages = NUMERO_DE_PAGINAS_A_BUSCAR

    # Dias en adelanto

    forward_days = DIAS_EN_ADELANTO

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        driver.implicitly_wait(10)

        # WebDriverWait(driver, 10).until(

        #     EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # with open('output.txt', 'a') as f:

        #     f.write(page_source)

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_=CLASS_EVENT_ELEMENT)

        print(f"====> Number of rows found: {len(event_elements)}")

        for event_element in event_elements:

            # Event name

            event_name_element = event_element.find(

                "h2", class_=CLASS_EVENT_NAME)

            event_name = event_name_element.text.strip(

            ) if event_name_element else TITULO_NO_DISPONIBLE

            if TITULO_NO_DISPONIBLE in event_name:

                continue

            # Event date and time

            date, time = get_event_date_time(

                event_element=event_element, event_name=event_name, forward_days=forward_days)

            if date is None:

                continue

            # URL DEL EVENTO

            url_element = event_element.find("a", class_=CLASS_URL_EVENT)

            event_url = url_element['href'] if url_element else URL_EVENTO_NO_DISPONIBLE

            categoria, description, location, address, longitude, latitude, image_url = get_event_values(

                driver, event_url)

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events:

                print(

                    f"===> Adding event name: [{event_name}] and date: [{date}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, address, date, time, description, categoria, longitude, latitude])

                seen_events.add(event_name)

            else:

                print(

                    f"----> Already added event name: [{event_name}] and date: [{date}]")

            print("\n ---")

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Address", "Date", "Time", "Description", "Categoria", "Longitude", "Latitude"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

if __name__ == "__main__":

    main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 11/12/2023 09:06:22

Muchísimas gracias por tu ayuda!

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 15/12/2023 10:11:00

Hola de nuevo!!!!

Cuando filtro por la condición:

NUMERO_DE_PAGINAS_A_BUSCAR = 50
DIAS_EN_ADELANTO = 5

Solo me devuelve 7 eventos y yo creo que es porque eventbrite no tiene AÑOS entonces el script se piensa que Enero es Enero 2023 no del 2024

===> Today: 2023-12-15
===> Date: 2023-01-08
===> Delta days: -341

No sé como solucionarlo

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por Costero (93 intervenciones) el 15/12/2023 19:05:51

Si tienes razon. Cambie el codigo para que ahora tome la fecha y tiempo de la pagina del evento. Asi que no mas tomorrow, today, etc.

Salu2

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

from selenium import webdriver

from bs4 import BeautifulSoup

import csv

from dateutil import parser

from datetime import datetime, timedelta

from selenium.webdriver.support.wait import WebDriverWait

from selenium.webdriver.common.by import By

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.firefox.options import Options

from selenium.common.exceptions import TimeoutException

TIEMPO_A_ESPERAR_PAGE = 20

NUMERO_DE_PAGINAS_A_BUSCAR = 6

DIAS_EN_ADELANTO = 25

BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events/?page="

# BASE_URL = "https://www.eventbrite.com/d/spain--madrid/free--events--next-month/?page="

FECHA_NO_DISPONIBLE = "Fecha no disponible"

HORA_NO_DISPONIBLE = "Hora no disponible"

TITULO_NO_DISPONIBLE = "Título no disponible"

URL_IMAGEN_NO_DISPONIBLE = "URL de imagen no disponible"

URL_EVENTO_NO_DISPONIBLE = "URL de evento no disponible"

NO_HAY_DESCRICCION = "No hay descricion"

NO_HAY_LOCACCION = "No hay locaccion"

NO_HAY_DIRECCION = "No hay direccion"

CLASS_EVENT_NAME = "Typography_root__4bejd"

CLASS_URL_EVENT = "event-card-link"

CLASS_DATE_EVENT = "Typography_root__4bejd"

ID_ROOT = "root"

CLASS_TAGS = "tags-link"

CLASS_EVENT_ELEMENT = "Stack_root__1ksk7"

CLASS_LOCATION = "location-info__address-text"

CLASS_ADDRESS = "location-info__address"

def is_date_ok(date, forward_days):

    '''Chequea que los dias del date sean mayor o igual a dias en adelante'''

    try:

        today = datetime.now().date()

        delta = date - today

        print(f"===> Delta days: {delta.days}")

        return delta.days >= forward_days

    except TypeError:

        print(f"La date no es date type: {date}")

        return False

def get_event_values(driver, event_url):

    '''Retorna la categorias, descriccion, locaccion, longitude, latitude, fecha, tiempo'''

    categoria = "n/a"

    description = "No hay descriccion"

    location = "No hay locaccion"

    date = FECHA_NO_DISPONIBLE

    time = FECHA_NO_DISPONIBLE

    address = NO_HAY_DIRECCION

    image_url = URL_IMAGEN_NO_DISPONIBLE

    longitude = "No meta longitude given"

    latitude = "No meta latitude given"

    if "URL de evento no disponible" not in event_url:

        print(f"\n===> Event url: {event_url}")

        driver.get(event_url)

        # driver.implicitly_wait(10)

        page_source = driver.page_source

        try:

            WebDriverWait(driver, TIEMPO_A_ESPERAR_PAGE).until(

                EC.presence_of_element_located((By.ID, ID_ROOT)))

        except TimeoutException:

            print(

                f"---> Element [{ID_ROOT}] was not found or took too long")

            return categoria, description, location, address, longitude, latitude, image_url, date, time

        soup_evento = BeautifulSoup(page_source, 'html.parser')

        summary_element = soup_evento.find("p", class_=ID_ROOT)

        description = summary_element.text.strip(

        ) if summary_element else NO_HAY_DESCRICCION

        location_element = soup_evento.find(

            "p", class_=CLASS_LOCATION)

        location = location_element.text.strip(

        ) if location_element else NO_HAY_LOCACCION

        address_element = soup_evento.find(

            "div", class_=CLASS_ADDRESS)

        try:

            address = address_element.contents[1].strip(

            ) if address_element else NO_HAY_DIRECCION

        except TypeError:

            print(f"---> TypeError, Address element: {address_element}")

        # Encontrar las categorias

        tags_element = soup_evento.findAll("a", class_=CLASS_TAGS)

        categorias = []

        for tag in tags_element:

            # print(f"tag: {tag.text}")

            categorias.append(tag.text)

        categoria = "|".join(categorias)

        longitude_meta = soup_evento.find(

            "meta", property="event:location:longitude")

        longitude = longitude_meta["content"] if longitude_meta else "No meta longitude given"

        latitude_meta = soup_evento.find(

            "meta", property="event:location:latitude")

        latitude = latitude_meta["content"] if latitude_meta else "No meta latitude given"

        starttime_meta = soup_evento.find(

            "meta", property="event:start_time")

        starttime = starttime_meta["content"] if starttime_meta else FECHA_NO_DISPONIBLE

        if FECHA_NO_DISPONIBLE not in starttime:

            datetime_object = datetime.strptime(

                starttime.rsplit(':', 2)[0], '%Y-%m-%dT%H:%M')

            date = datetime_object.date()

            time = datetime_object.time()

        # IMAGEN

        image_meta = soup_evento.find(

            "meta", property="og:image")

        # Extracta la URL de la imagen

        image_url = image_meta["content"] if image_meta else URL_IMAGEN_NO_DISPONIBLE

        # print(image_url)

    return categoria, description, location, address, longitude, latitude, image_url, date, time

def main():

    # URL base de Eventbrite para eventos en Madrid

    # base_url = BASE_URL

    base_url = BASE_URL

    # Inicializa el controlador web de Selenium

    # Necesitas tener Chrome y chromedriver instalados

    driver = webdriver.Chrome()

    # options = Options()

    # options.headless = True

    # driver = webdriver.Firefox(options=options)

    # Número de páginas a buscar

    num_pages = NUMERO_DE_PAGINAS_A_BUSCAR

    # Dias en adelanto

    forward_days = DIAS_EN_ADELANTO

    # Conjunto para evitar eventos duplicados

    seen_events = set()

    # Lista para almacenar los datos de los eventos

    data = []

    for page_number in range(1, num_pages + 1):

        # Construye la URL de la página actual

        url = f"{base_url}{page_number}"

        print(f'\n\n===> url: {url}')

        # Realiza la solicitud HTTP para obtener la página

        driver.get(url)

        # Espera a que la página se cargue completamente (puedes ajustar el tiempo según sea necesario)

        # driver.implicitly_wait(TIEMPO_A_ESPERAR_PAGE)

        WebDriverWait(driver, TIEMPO_A_ESPERAR_PAGE).until(

            EC.presence_of_element_located((By.CLASS_NAME, "Stack_root__1ksk7")))

        # Obtiene el contenido de la página después de que se haya ejecutado JavaScript

        page_source = driver.page_source

        # with open('output.txt', 'a') as f:

        #     f.write(page_source)

        # Parsea la página con BeautifulSoup

        soup = BeautifulSoup(page_source, 'html.parser')

        # Encuentra todos los elementos que contienen detalles de eventos

        event_elements = soup.find_all("div", class_=CLASS_EVENT_ELEMENT)

        print(f"====> Number of rows found: {len(event_elements)}")

        for event_element in event_elements:

            # Event name

            event_name_element = event_element.find(

                "h2", class_=CLASS_EVENT_NAME)

            event_name = event_name_element.text.strip(

            ) if event_name_element else TITULO_NO_DISPONIBLE

            if TITULO_NO_DISPONIBLE in event_name:

                continue

            # URL DEL EVENTO

            url_element = event_element.find("a", class_=CLASS_URL_EVENT)

            event_url = url_element['href'] if url_element else URL_EVENTO_NO_DISPONIBLE

            categoria, description, location, address, longitude, latitude, image_url, date, time = get_event_values(

                driver, event_url)

            # Verifica si el evento ya ha sido procesado

            if event_name not in seen_events and is_date_ok(date, forward_days):

                print(

                    f"===> Adding event name: [{event_name}] and date: [{date}]")

                # Agrega los datos a la lista y al conjunto

                data.append(

                    [event_name, image_url, event_url, location, address, date, time, description, categoria, longitude, latitude])

                seen_events.add(event_name)

            else:

                print(

                    "----> Already added or date less than forward days "

                    f"{DIAS_EN_ADELANTO} event name: [{event_name}] and date: [{date}]")

            print("\n ---")

    print(f"Seen events size: {len(seen_events)}")

    print(f"Number of events: {len(data)}")

    # Guardar datos en CSV

    with open('eventbrite_main_info.csv', 'w', newline='', encoding='utf-8') as csvfile:

        writer = csv.writer(csvfile)

        writer.writerow(["Name", "Image URL", "Event URL",

                        "Location", "Address", "Date", "Time", "Description", "Categoria", "Longitude", "Latitude"])

        for event in data:

            writer.writerow(event)

    print("Scraping completado. Los datos han sido guardados en eventbrite_main_info.csv.")

    # Cerrar el controlador de Selenium

    driver.quit()

# Comenzando a processar

if __name__ == "__main__":

    main()

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 19/12/2023 09:02:35

Muchas gracias de nuevo!!!!

Valora esta respuesta

Comentar

Web Scrapping EventBrite

Publicado por mariona (17 intervenciones) el 19/12/2023 10:36:27

Tú sabrías como ayudarme?
Tenía este código para el scrappeo de MeetUp (otra web de ventos) que me funcionaba, pero ahora no me funciona.
No sé si es porque el sistema de páginación (scroll infinito) ha cambiado o porqué pero ya no me funciona y antes sí.
Pretendo scrappear los eventos de madrid como en el anteiror código y con mismas condiciones, cogiendo eventos a partir de 15 días en adelante.

from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import os
import time # Import the time module
import random

# URL of the page with meetup events in Madrid
base_url = "https://www.meetup.com/es-ES/find/?location=es--Madrid&source=EVENTS&eventType=inPerson"

# Limit of events you want to collect
event_limit = 100

# Directory to store the event images
image_dir = "event_images"
os.makedirs(image_dir, exist_ok=True)

# List to store event data
data = []

# Initialize a Selenium web driver
driver = webdriver.Chrome() # You need to have Chrome and chromedriver installed

# Function to get event details including the image
def get_event_details(event_url):
driver.get(event_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Introduce a 0.5-second delay before retrieving the map link
time.sleep(1 + random.uniform(0, 1))

# Find the HTML element containing the event image from within the event page
image_element = soup.find("img", {"alt": True, "data-nimg": True})

# Extract the image URL
image_url = image_element['src'] if image_element else "Image URL not available"

description_element = soup.find("div", {"class": "break-words"})
description = description_element.find_all("p")
description_text = "\n".join([p.text.strip() for p in description])

location_name_element = soup.select_one("a[data-testid='venue-name-link']").text if soup.select_one(
"a[data-testid='venue-name-link']") else "Information not available"

location_info_element = soup.find("div", {"class": "text-gray6", "data-testid": "location-info"})
location_info = location_info_element.text.strip() if location_info_element else "Information not available"

map_link_element = soup.find("a", {"data-testid": "map-link"})['href'] if soup.find("a", {
"data-testid": "map-link"}) else "Map link not available"

categories = get_event_categories(event_url)

return image_url, description_text, location_name_element, location_info, map_link_element, categories

# Function to get event categories
def get_event_categories(event_url):
driver.get(event_url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

categories_element = soup.find("div", {"id": "topics"})

if categories_element:
categories = categories_element.find_all("a")
categories_list = [category.text for category in categories]
categories_text = ", ".join(categories_list)
else:
categories_text = "Categories not available"

return categories_text

# Counter for the number of collected events
event_count = 0
# Conjunto para almacenar URL de eventos ya recopiladas
collected_event_urls = set()
# Current page number
page_number = 1

while event_count < event_limit:
# Build the URL of the current page
url = f"{base_url}&page={page_number}"

# Send an HTTP request to get the page
driver.get(url)
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Find all elements containing event details
event_elements = soup.find_all("div", {"data-element-name": "categoryResults-eventCard"})

if not event_elements:
break

for event_element in event_elements:
# Find the image URL, event name, and other details
event_url_element = event_element.find("a", {"data-event-label": "Event card"})
event_url = event_url_element["href"]

# Verifica si ya hemos recopilado este evento
if event_url in collected_event_urls:
continue

image_url, description_text, location_name, location_info, map_link, event_categories = get_event_details(event_url)

event_name = event_element.find('h2', class_="text-gray7 font-medium text-base pb-1 pt-0 line-clamp-3").text.strip()
event_date = event_element.find("span").text.strip()
event_time_element = event_element.find("time")
if event_time_element:
event_time = event_time_element.find_all("span")[1].text.strip()
else:
event_time = "Time not available"
event_group = event_element.find("p", class_="text-gray6").text.strip()
# Añade la URL del evento al conjunto de eventos recopilados
collected_event_urls.add(event_url)
data.append([image_url, event_name, event_date, event_time, event_group, description_text, location_name, location_info, map_link, event_categories])

event_count += 1

if event_count >= event_limit:
break

page_number += 1

# Save the data to a CSV file
with open('meetup_events_madrid.csv', 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["Image URL", "Name", "Date", "Time", "Group", "Description", "Location Name", "Location Info", "Map Link", "Categories"])
for event in data:
writer.writerow(event)

Valora esta respuesta

Comentar

Ayuda con Trabajo para la Universidad Código Python

SCRIPS DESHABILITADOS