schema_cropyield = StructType([
StructField("Crop", StringType(), True),
StructField("Region", StringType(), True),
StructField("Soil_Type", StringType(), True),
StructField("Soil_pH", DoubleType(), True),
StructField("Rainfall_mm", DoubleType(), True),
StructField("Temperature_C", DoubleType(), True),
StructField("Humidity_pct", DoubleType(), True),
StructField("Fertilizer_Used_kg", DoubleType(), True),
StructField("Irrigation", StringType(), True),
StructField("Pesticides_Used_kg", DoubleType(), True),
StructField("Planting_Density", DoubleType(), True),
StructField("Previous_Crop", StringType(), True),
StructField("Yield_ton_per_ha", DoubleType(), True),
])
df_sel = ( spark.read
.format("csv")
.schema(schema_cropyield)
.option("header", "true")
.load("crop_yield_dataset.csv")
.select("Crop", "Region", "Temperature_C", "Rainfall_mm", "Irrigation", "Yield_ton_per_ha")
)
o
df_sel = (df_crop
.select("Crop", "Region", "Temperature_C", "Rainfall_mm", "Irrigation", "Yield_ton_per_ha")
)
+------+--------+-------------+-----------+----------+----------------+
| Crop| Region|Temperature_C|Rainfall_mm|Irrigation|Yield_ton_per_ha|
+------+--------+-------------+-----------+----------+----------------+
| Maize|Region_C| 19.7| 1485.4| Drip| 101.48|
|Barley|Region_D| 29.1| 399.4| Sprinkler| 127.39|
| Rice|Region_C| 30.5| 980.9| Sprinkler| 68.99|
| Maize|Region_D| 26.4| 1054.3| Drip| 169.06|
| Maize|Region_D| 20.4| 744.6| Drip| 118.71|
+------+--------+-------------+-----------+----------+----------------+
df_renamed = (df_sel
.withColumnRenamed("Temperature_C", "Temperatura")
.withColumnRenamed("Rainfall_mm", "Lluvia")
.withColumnRenamed("Yield_ton_per_ha", "Rendimiento")
)
df_renamed.show(5)
+------+--------+-----------+------+----------+-----------+
| Crop| Region|Temperatura|Lluvia|Irrigation|Rendimiento|
+------+--------+-----------+------+----------+-----------+
| Maize|Region_C| 19.7|1485.4| Drip| 101.48|
|Barley|Region_D| 29.1| 399.4| Sprinkler| 127.39|
| Rice|Region_C| 30.5| 980.9| Sprinkler| 68.99|
| Maize|Region_D| 26.4|1054.3| Drip| 169.06|
| Maize|Region_D| 20.4| 744.6| Drip| 118.71|
+------+--------+-----------+------+----------+-----------+
df_renamed_filter = (df_renamed
.filter(
(col("Crop") == "Maize") & (col("Temperatura") > 25)
)
)
df_renamed_filter.show()
+-----+--------+-----------+------+----------+-----------+
| Crop| Region|Temperatura|Lluvia|Irrigation|Rendimiento|
+-----+--------+-----------+------+----------+-----------+
|Maize|Region_D| 26.4|1054.3| Drip| 169.06|
|Maize|Region_C| 32.4| 846.1| None| 162.2|
|Maize|Region_A| 26.6| 362.5| Sprinkler| 95.23|
|Maize|Region_C| 33.7|1193.3| None| 110.57|
|Maize|Region_C| 27.8| 695.2| Flood| 143.84|
|Maize|Region_D| 30.2|1001.4| Flood| 138.61|
|Maize|Region_A| 27.7| 747.7| Sprinkler| 114.58|
|Maize|Region_B| 28.9|1392.9| Drip| 169.23|
|Maize|Region_B| 34.7| 694.4| Drip| 96.08|
|Maize|Region_D| 29.5| 848.8| Flood| 93.45|
|Maize|Region_D| 32.8|1067.7| Flood| 154.6|
|Maize|Region_A| 28.5| 406.1| Sprinkler| 55.26|
|Maize|Region_D| 26.0| 391.4| Sprinkler| 100.34|
|Maize|Region_C| 25.9|1444.8| Sprinkler| 135.8|
|Maize|Region_D| 27.8| 823.3| Sprinkler| 161.48|
|Maize|Region_D| 28.7| 955.8| Flood| 91.4|
|Maize|Region_A| 33.2| 248.4| None| 149.49|
|Maize|Region_B| 34.3| 410.4| Flood| 37.78|
|Maize|Region_A| 27.1| 763.9| Drip| 110.63|
|Maize|Region_C| 28.8|1215.0| Flood| 127.89|
+-----+--------+-----------+------+----------+-----------+
df = (df_crop
.select("Crop", "Region", "Temperature_C", "Rainfall_mm", "Irrigation", "Yield_ton_per_ha")
.withColumnRenamed("Temperature_C", "Temperatura")
.withColumnRenamed("Rainfall_mm", "Lluvia")
.withColumnRenamed("Yield_ton_per_ha", "Rendimiento")
.filter(
(col("Crop") == "Maize") & (col("Temperatura") > 25)
)
)
df.show()
+-----+--------+-----------+------+----------+-----------+
| Crop| Region|Temperatura|Lluvia|Irrigation|Rendimiento|
+-----+--------+-----------+------+----------+-----------+
|Maize|Region_D| 26.4|1054.3| Drip| 169.06|
|Maize|Region_C| 32.4| 846.1| None| 162.2|
|Maize|Region_A| 26.6| 362.5| Sprinkler| 95.23|
|Maize|Region_C| 33.7|1193.3| None| 110.57|
|Maize|Region_C| 27.8| 695.2| Flood| 143.84|
+-----+--------+-----------+------+----------+-----------+
df_base = (df_worldfamous
.select("Place_Name", "Country", "UNESCO_World_Heritage", "Entry_Fee_USD", "Annual_Visitors_Millions")
)
+-------------------+-------------+---------------------+-------------+------------------------+
| Place_Name| Country|UNESCO_World_Heritage|Entry_Fee_USD|Annual_Visitors_Millions|
+-------------------+-------------+---------------------+-------------+------------------------+
| Eiffel Tower| France| No| 35| 7.0|
| Times Square|United States| No| 0| 50.0|
| Louvre Museum| France| Yes| 22| 8.7|
|Great Wall of China| China| Yes| 10| 10.0|
| Taj Mahal| India| Yes| 15| 7.5|
+-------------------+-------------+---------------------+-------------+------------------------+
df_es = (df_base
.withColumnRenamed("Place_Name", "Lugar")
.withColumnRenamed("UNESCO_World_Heritage", "Es_UNESCO")
.withColumnRenamed("Entry_Fee_USD", "Precio_Entrada")
.withColumnRenamed("Annual_Visitors_Millions", "Visitantes_Millones")
)
+-------------------+-------------+---------+--------------+-------------------+
| Lugar| Country|Es_UNESCO|Precio_Entrada|Visitantes_Millones|
+-------------------+-------------+---------+--------------+-------------------+
| Eiffel Tower| France| No| 35| 7.0|
| Times Square|United States| No| 0| 50.0|
| Louvre Museum| France| Yes| 22| 8.7|
|Great Wall of China| China| Yes| 10| 10.0|
| Taj Mahal| India| Yes| 15| 7.5|
+-------------------+-------------+---------+--------------+-------------------+
df_es = (df_es
.filter(
(col("Es_UNESCO") == "Yes") & (col("Precio_Entrada") <= 20)
)
)
+--------------------+--------------+---------+--------------+-------------------+
| Lugar| Country|Es_UNESCO|Precio_Entrada|Visitantes_Millones|
+--------------------+--------------+---------+--------------+-------------------+
| Great Wall of China| China| Yes| 10| 10.0|
| Taj Mahal| India| Yes| 15| 7.5|
| Colosseum| Italy| Yes| 18| 7.65|
| Forbidden City| China| Yes| 8| 9.0|
|Notre-Dame Cathedral| France| Yes| 0| 13.0|
|Great Pyramid of ...| Egypt| Yes| 20| 2.8|
|Leaning Tower of ...| Italy| Yes| 20| 5.0|
| Acropolis| Greece| Yes| 13| 4.0|
| Big Ben|United Kingdom| Yes| 0| 5.5|
+--------------------+--------------+---------+--------------+-------------------+
df_contactos = (df_turismo
.select("Nombre", "Tipo", "Provincia", "web", "Email")
)
+--------------------+--------------------+---------+--------------------+--------------------+
| Nombre| Tipo|Provincia| web| Email|
+--------------------+--------------------+---------+--------------------+--------------------+
|BERNARDO MORO MEN...|Profesional de Tu...| Asturias| NULL|bernardomoro@hotm...|
| LA SASTRERÍA|Casa Rural de Alq...| Ávila|www.lasastreriade...| NULL|
| LAS HAZANAS|Casa Rural de Alq...| Ávila| NULL|lashazanas@hotmai...|
| LA CASITA DEL PAJAR|Casa Rural de Alq...| Ávila| NULL|lashazanas@hotmai...|
| MARACANA| Bar| Ávila| NULL|emo123anatoliev@g...|
+--------------------+--------------------+---------+--------------------+--------------------+
df_limpio = (df_contactos
.withColumnRenamed("Nombre", "nombre_establecimiento")
.withColumnRenamed("Tipo", "categoria_actividad")
.withColumnRenamed("web", "sitio_web")
.withColumnRenamed("Email", "correo_electronico")
)
+----------------------+--------------------+---------+--------------------+--------------------+
|nombre_establecimiento| categoria_actividad|Provincia| sitio_web| correo_electronico|
+----------------------+--------------------+---------+--------------------+--------------------+
| BERNARDO MORO MEN...|Profesional de Tu...| Asturias| NULL|bernardomoro@hotm...|
| LA SASTRERÍA|Casa Rural de Alq...| Ávila|www.lasastreriade...| NULL|
| LAS HAZANAS|Casa Rural de Alq...| Ávila| NULL|lashazanas@hotmai...|
| LA CASITA DEL PAJAR|Casa Rural de Alq...| Ávila| NULL|lashazanas@hotmai...|
| MARACANA| Bar| Ávila| NULL|emo123anatoliev@g...|
+----------------------+--------------------+---------+--------------------+--------------------+
df_final = (df_limpio
.filter(
(col("Provincia") == "Burgos") &
(col("categoria_actividad").like("%Bodega%")) &
(col("sitio_web").isNotNull())
)
)
+----------------------+--------------------+---------+--------------------+--------------------+
|nombre_establecimiento| categoria_actividad|Provincia| sitio_web| correo_electronico|
+----------------------+--------------------+---------+--------------------+--------------------+
| BODEGAS TARSUS|g - Bodegas y los...| Burgos| www.tarsusvino.com| NULL|
| BODEGAS DOMINIO D...|g - Bodegas y los...| Burgos|www.dominiodecair...|bodegas@dominiode...|
| TERRITORIO LUTHIER|g - Bodegas y los...| Burgos|territorioluthier...|luthier@territori...|
| BODEGA COVARRUBIAS|g - Bodegas y los...| Burgos| http://valdable.com| info@valdable.com|
| BODEGAS PASCUAL, ...|g - Bodegas y los...| Burgos|222.bodegaspascua...|export@bodegaspas...|
+----------------------+--------------------+---------+--------------------+--------------------+