Cambios hechos en clase

joanby · Nov 22, 2018 · 37eeb4b · 37eeb4b
1 parent 571bbff
commit 37eeb4b
Show file tree

Hide file tree

Showing 7 changed files with 121 additions and 24 deletions.
diff --git a/scripts/Hola b/scripts/Hola
@@ -0,0 +1,23 @@
+"V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9"
+"1" 1.06 9.2 151 54.4 1.6 9077 0 0.628 "Arizona"
+"2" 0.89 10.3 202 57.9 2.2 5088 25.3 1.555 "Boston"
+"3" 1.43 15.4 113 53 3.4 9212 0 1.058 "Central"
+"4" 1.02 11.2 168 56 0.3 6423 34.3 0.7 "Common"
+"5" 1.49 8.8 192 51.2 1 3300 15.6 2.044 "Consolid"
+"6" 1.32 13.5 111 60 -2.2 11127 22.5 1.241 "Florida"
+"7" 1.22 12.2 175 67.6 2.2 7642 0 1.652 "Hawaiian"
+"8" 1.1 9.2 245 57 3.3 13082 0 0.309 "Idaho"
+"9" 1.34 13 168 60.4 7.2 8406 0 0.862 "Kentucky"
+"10" 1.12 12.4 197 53 2.7 6455 39.2 0.623 "Madison"
+"11" 0.75 7.5 173 51.5 6.5 17441 0 0.768 "Nevada"
+"12" 1.13 10.9 178 62 3.7 6154 0 1.897 "NewEngla"
+"13" 1.15 12.7 199 53.7 6.4 7179 50.2 0.527 "Northern"
+"14" 1.09 12 96 49.8 1.4 9673 0 0.588 "Oklahoma"
+"15" 0.96 7.6 164 62.2 -0.1 6468 0.9 1.4 "Pacific"
+"16" 1.16 9.9 252 56 9.2 15991 0 0.62 "Puget"
+"17" 0.76 6.4 136 61.9 9 5714 8.3 1.92 "SanDiego"
+"18" 1.05 12.6 150 56.7 2.7 10140 0 1.108 "Southern"
+"19" 1.16 11.7 104 54 -2.1 13507 0 0.636 "Texas"
+"20" 1.2 11.8 148 59.9 3.5 7287 41.1 0.702 "Wisconsi"
+"21" 1.04 8.6 204 61 3.5 6650 0 2.116 "United"
+"22" 1.07 9.3 174 54.3 5.9 10093 26.6 1.306 "Antonio"
diff --git a/scripts/tema1/01-data_visualization.R b/scripts/tema1/01-data_visualization.R
@@ -1,5 +1,6 @@
 #Data Visualization - 11 de Mayo de 2018
 library(tidyverse)
+
 #tidyverse 1.2.1 ──
 #✔ ggplot2 2.2.1     ✔ purrr   0.2.4
 #✔ tibble  1.4.2     ✔ dplyr   0.7.4
@@ -16,13 +17,23 @@ View(mpg)
 # displ: tamaño del motor del coche en litros
 # hwy: número de millas recorridas en autopista por galón de combustible (3.785411784 litros)
 
+ggplot(data = mpg)
+
+mpg %>% ggplot()
+
+
 ggplot(data = mpg) + 
   geom_point(mapping = aes(x = displ, y = hwy))
 
 #PLANTILLA PARA HACER UNA REPRESENTACIÓN GRÁFICA CON GGPLOT
 #ggplot(data = <DATA_FRAME>) +
 #  <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))
 
+ggplot(data = mpg) +
+  geom_point(mapping = aes(x = class, y = drv))
+
+
+
 #Color de los puntos
 ggplot(data = mpg) +
   geom_point(mapping = aes(x = displ, y = hwy, color = class))
@@ -54,15 +65,18 @@ ggplot() +
   scale_y_continuous(name="") +
   scale_x_continuous(name="") +
   scale_shape_identity() +
-  geom_point(data=d, mapping=aes(x=p%%16, y=p%/%16, shape=p), size=5, fill="red") +
+  geom_point(data=d, mapping=aes(x=p%%16, y=p%/%16, shape=p), size=5, fill="yellow") +
   geom_text(data=d, mapping=aes(x=p%%16, y=p%/%16+0.25, label=p), size=3)
 
 
 
 ggplot(data = mpg) + 
   geom_point(mapping = aes(x = displ, y = hwy), 
              shape = 23, size = 10, color = "red", 
-             fill = "yellow")
+             fill = 'yellow')
+
+ggplot(data = mpg) + 
+  geom_point(mapping = aes(x=displ, y = hwy, color = displ<5))
 
 
 ##FACETS

diff --git a/scripts/tema1/02-data_transform.R b/scripts/tema1/02-data_transform.R
@@ -2,9 +2,14 @@ library(tidyverse)
 library(nycflights13)
 
 
-nycflights13::flights
+tt<-nycflights13::flights
 ?flights
 View(flights)
+?tibble
+
+head(flights)
+tail(flights)
+
 #tibble es un data frame mejorado para tidyverse
 ## * int -> números enteros
 ## * dbl -> números reales (double)
@@ -28,6 +33,9 @@ View(flights)
 
 ### FILTER
 jan1 <- filter(flights, month == 1, day == 1)
+flights %>% 
+  filter(month == 1, day == 1) %>%
+  filter(dep_delay>0)
 
 may19 <- filter(flights, month == 5, day == 19)
 
@@ -39,7 +47,9 @@ filter(flights, month == 5)
 2 == 2
 
 sqrt(2)^2 == 2
+sqrt(2)^2 - 2
 near(sqrt(2)^2, 2)
+?near
 1/pi * pi == 1
 1/49 * 49 == 1
 near(1/49*49, 1)
@@ -49,7 +59,7 @@ filter(flights, month == 5 | month == 6)
 filter(flights, month == 5 | 6)# NO FUNCIONA...
 
 may_june <- filter(flights, month %in% c(5,6))
-
+#LEYES DE MORGAN
 #!(x&y) == (!x)|(!y)
 #!(x|y) == (!x)&(!y)
 
@@ -73,6 +83,7 @@ age.mery == age.john
 is.na(age.mery)
 
 df <- tibble(x = c(1,2,NA,4,5))
+df
 filter(df, x>2)
 filter(df, is.na(x)|x>2)
 
@@ -110,6 +121,8 @@ tail(flights, 10)
 
 ### ARRANGE
 sorted_date <- arrange(flights, year, month, day)
+flights %>% arrange(year, month, day)
+tail(flights)
 tail(sorted_date)
 
 head(arrange(flights, desc(arr_delay)))
@@ -126,7 +139,7 @@ View(arrange(flights, desc(distance)))
 
 ### SELECT
 
-View(sorted_date[1024:1068,])
+View(sorted_date[1024:1068,TRUE])
 
 View(select(sorted_date[1024:1068,], dep_delay, arr_delay))
 
@@ -149,14 +162,13 @@ select(flights, num_range("x",1:5))# x1, x2, x3, x4, x5
 ?select
 
 rename(flights, deptime = dep_time, 
-       año = year, mes = month, dia = day)
+       anio = year, mes = month, dia = day)
 
 select(flights, deptime = dep_time)
 
 select(flights, time_hour, distance, air_time, everything())
 
 
-
 sorted_date
 
 
@@ -199,13 +211,23 @@ flights_new <- select(flights,
                       distance, 
                       air_time)
 
+flights_new
+
 mutate(flights_new,
        time_gain = arr_delay - dep_delay,    #diff_t (min)
        air_time_hour = air_time/60,
        flight_speed = distance/air_time_hour, #v = s/t (km/h)
        time_gain_per_hour = time_gain / air_time_hour
        ) -> flights_new
 
+View(flights_new)
+
+flights_new %>%
+  filter(!is.na(time_gain_per_hour)) %>%
+  ggplot() + 
+  geom_histogram(mapping = aes(x=time_gain_per_hour),
+                 bins = 300)
+
 
 transmute(flights_new,
           time_gain = arr_delay - dep_delay,
@@ -229,7 +251,7 @@ transmute(flights,
 # * Offsets: lead()->mueve hacia la izquierda, lag()->mueve hacia la derecha
 df <- 1:12
 df
-lag(df)
+lag(df,4)
 lead(df)
 # * Funciones acumulativas: cumsum(), cumprod(), cummin(), cummax(), cummean()
 df
@@ -302,16 +324,25 @@ arrange(mutate(flights,
 
 summarise(flights, delay = mean(dep_delay, na.rm = T))
 
-by_month_group <- group_by(flights, year, month)
-summarise(by_month_group, delay = mean(dep_delay, na.rm = T))
+flights %>%
+  group_by(year, month) %>%
+  summarise(delay = mean(dep_delay, na.rm = T))
 
-by_day_group <- group_by(flights, year, month, day)
-summarise(by_day_group, 
+flights %>%
+  group_by(year, month, day) %>%
+  summarise(
           delay = mean(dep_delay, na.rm = T),
           median = median(dep_delay, na.rm = T),
           min = min(dep_delay, na.rm = T)
           )
 
+flights %>%
+  group_by(carrier) %>%
+  summarise(
+    delay = mean(dep_delay, na.rm = T),
+    num = n()
+    ) 
+
 mutate(summarise(group_by(flights, carrier),
           delay = mean(dep_delay, na.rm = T)),
           sorted = min_rank(delay)
@@ -364,8 +395,10 @@ flights %>%
             median = median(dep_delay, na.rm = T),
             sd = sd(dep_delay, na.rm = T),
             count = n()
-  )
+  ) 
 
+not_cancelled <- flights %>%
+  filter(!is.na(dep_delay), !is.na(arr_delay))
 
 delay_numtail <- not_cancelled %>%
   group_by(tailnum) %>%
@@ -454,6 +487,7 @@ not_cancelled %>%
 # Medida de posición 
 not_cancelled %>%
   group_by(carrier) %>%
+  arrange(dep_time) %>%
   summarise(
     first_dep = first(dep_time),
     second_dep = nth(dep_time, 2),
@@ -467,14 +501,16 @@ not_cancelled %>%
   mutate(rank = min_rank(dep_time)) %>%
   filter(rank %in% range(rank)) -> temp
 
+View(temp)
 
 # Funciones de conteo
 flights %>%
   group_by(dest) %>%
   summarise(
     count = n(),
     carriers = n_distinct(carrier),
-    arrivals = sum(!is.na(arr_delay))
+    arrivals = sum(!is.na(arr_delay)),
+    cancelled = count - arrivals
   ) %>%
   arrange(desc(carriers))
 
@@ -505,6 +541,8 @@ summarise(business, n_fl = n()) %>%
   summarise(n_fl = sum(n_fl)) %>%
   summarise(n_fl = sum(n_fl)) 
 
+business
+
 business %>%
   ungroup() %>%
   summarise(n_fl = n())

diff --git a/scripts/tema1/03-eda.R b/scripts/tema1/03-eda.R
@@ -32,14 +32,22 @@ diamonds %>%
   count(cut_width(carat, 0.5))
 
 
+ggplot(diamonds, mapping = aes(x = "Kilates", y = carat)) + 
+  geom_boxplot()
+
+diamonds %>%
+  ggplot() + 
+  geom_boxplot(mapping = aes(x = cut, y = carat, color = cut))
+
 diamonds_filter <- diamonds %>%
   filter(carat<3)
 
 ggplot(data = diamonds_filter) + 
   geom_histogram(mapping = aes(x = carat), binwidth = 0.01)
 
-ggplot(data = diamonds_filter, mapping = aes(x = carat, color = cut))+
-  geom_freqpoly(binwidth = 0.1)
+ggplot(data = diamonds_filter, 
+       mapping = aes(x = carat, color = cut))+
+  geom_freqpoly(binwidth = 0.01)
 
 # * ¿Cuáles son los valores más comunes? ¿Por qué?
 # * ¿Cuáles son los valores más raros? ¿Por qué? ¿Cumple con lo que esperábamos?
@@ -57,10 +65,20 @@ ggplot(data = faithful, mapping = aes(x = eruptions)) +
   geom_histogram(binwidth = 0.2)
 
 # outliers
-ggplot(diamonds) + 
-  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
+  ggplot(diamonds) + 
+    geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
   coord_cartesian(ylim = c(0,100))
 
+diamonds %>%
+  ggplot(mapping=aes(x = price)) + 
+  geom_histogram(binwidth = 100)
+
+diamonds %>%
+  filter(price > 18000) %>%
+  ggplot(mapping = aes(x = y))+
+  geom_histogram()
+
+
 unusual_diamonds <- diamonds %>%
   filter(y<2 | y >30) %>%
   select(price, x,y,z) %>%
@@ -77,7 +95,8 @@ good_diamonds <- diamonds %>%
 
 ?ifelse  
 
-ggplot(data = good_diamonds, mapping = aes(x = x, y = y)) + 
+ggplot(data = good_diamonds, 
+       mapping = aes(x = x, y = y)) + 
   geom_point(na.rm = T)
 
 nycflights13::flights %>%
@@ -139,13 +158,13 @@ ggplot(good_diamonds) +
 # Categoría vs Continua
 
 ggplot(data = diamonds, mapping = aes(x = price)) + 
-  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
+  geom_freqpoly(mapping = aes(color = cut), binwidth = 50)
 
 ggplot(diamonds) + 
   geom_bar(mapping = aes(x = cut))
 
 ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
-  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
+  geom_freqpoly(mapping = aes(color = cut), binwidth = 100)
 
 ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + 
   geom_boxplot()
@@ -182,7 +201,8 @@ diamonds %>%
 # Continua vs Continua
 
 ggplot(data = diamonds) + 
-  geom_point(mapping = aes(x = carat, y = price), alpha = 0.01)
+  geom_point(mapping = aes(x = carat, y = price), 
+             alpha = 0.01)
 
 install.packages("hexbin")
 library(hexbin)

diff --git a/scripts/tema2/01-tibbles.R b/scripts/tema2/01-tibbles.R
@@ -13,6 +13,7 @@ t <- tibble(
   z = y * x ^ 2
 )
 
+View(t)
 t[2,3]
 
 t2 <- tibble(
@@ -21,7 +22,7 @@ t2 <- tibble(
   `1988` = "number"
 )
 
-t2
+t2$`:)`
 
 tribble(
   ~x, ~y, ~z,

diff --git a/scripts/tema2/02-data-import.R b/scripts/tema2/02-data-import.R
@@ -32,6 +32,7 @@ read_csv("Este fichero fue generado por Juan Gabriel
          4,5,6", skip = 3)
 
 read_csv("#Esto es un comentario
+#Me aburro en clase del master...
          x,y,z
          1,2,3
          4,5,6", comment = "#")

diff --git a/scripts/tema4/04-gapminder.R b/scripts/tema4/04-gapminder.R
@@ -55,7 +55,7 @@ by_country <- by_country %>%
   mutate(model = map(data, country_model))
 
 by_country %>%
-  filter(continent == "Europe")
+   filter(continent == "Europe")
 
 by_country %>%
   arrange(continent, country)