Joining Data

Code for Quiz 6, more dplyr and our first interactive chart using echarts4r.

Steps 1-6

  1. Load the R packages we will use.
library(tidyverse)
library(echarts4r)  #install this package before using
library(hrbrthemes) #install this package before using
  1. Read the data in the files, drug_cos.csv, health_cos.csv in to R and assign to the variables drug_cos and health_cos, respectively
drug_cos  <- read_csv("https://estanny.com/static/week6/drug_cos.csv")
health_cos  <- read_csv("https://estanny.com/static/week6/health_cos.csv")
  1. Use glimpse to get a glimpse of the data
drug_cos %>% glimpse()
Rows: 104
Columns: 9
$ ticker       <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS"~
$ name         <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoet~
$ location     <chr> "New Jersey; U.S.A", "New Jersey; U.S.A", "New ~
$ ebitdamargin <dbl> 0.149, 0.217, 0.222, 0.238, 0.182, 0.335, 0.366~
$ grossmargin  <dbl> 0.610, 0.640, 0.634, 0.641, 0.635, 0.659, 0.666~
$ netmargin    <dbl> 0.058, 0.101, 0.111, 0.122, 0.071, 0.168, 0.163~
$ ros          <dbl> 0.101, 0.171, 0.176, 0.195, 0.140, 0.286, 0.321~
$ roe          <dbl> 0.069, 0.113, 0.612, 0.465, 0.285, 0.587, 0.488~
$ year         <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,~
health_cos %>% glimpse() 
Rows: 464
Columns: 11
$ ticker      <chr> "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS", "ZTS",~
$ name        <chr> "Zoetis Inc", "Zoetis Inc", "Zoetis Inc", "Zoeti~
$ revenue     <dbl> 4233000000, 4336000000, 4561000000, 4785000000, ~
$ gp          <dbl> 2581000000, 2773000000, 2892000000, 3068000000, ~
$ rnd         <dbl> 427000000, 409000000, 399000000, 396000000, 3640~
$ netincome   <dbl> 245000000, 436000000, 504000000, 583000000, 3390~
$ assets      <dbl> 5711000000, 6262000000, 6558000000, 6588000000, ~
$ liabilities <dbl> 1975000000, 2221000000, 5596000000, 5251000000, ~
$ marketcap   <dbl> NA, NA, 16345223371, 21572007994, 23860348635, 2~
$ year        <dbl> 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, ~
$ industry    <chr> "Drug Manufacturers - Specialty & Generic", "Dru~
  1. Which variables are the same in both data sets
names_drug  <- drug_cos  %>%  names() 
names_health  <- health_cos  %>%  names() 
intersect(names_drug, names_health)
[1] "ticker" "name"   "year"  
  1. Select subset of variables to work with
drug_subset  <- drug_cos  %>% 
  select(ticker, year, grossmargin)  %>% 
  filter(year == 2018)

health_subset  <- health_cos  %>%
  select(ticker, year, revenue, gp, industry)  %>% 
  filter(year == 2018)
  1. Keep all the rows and columns drug_subset join with columns in health_subset
drug_subset  %>% left_join(health_subset)
# A tibble: 13 x 6
   ticker  year grossmargin     revenue          gp industry          
   <chr>  <dbl>       <dbl>       <dbl>       <dbl> <chr>             
 1 ZTS     2018       0.672  5825000000  3914000000 Drug Manufacturer~
 2 PRGO    2018       0.387  4731700000  1831500000 Drug Manufacturer~
 3 PFE     2018       0.79  53647000000 42399000000 Drug Manufacturer~
 4 MYL     2018       0.35  11433900000  4001600000 Drug Manufacturer~
 5 MRK     2018       0.681 42294000000 28785000000 Drug Manufacturer~
 6 LLY     2018       0.738 24555700000 18125700000 Drug Manufacturer~
 7 JNJ     2018       0.668 81581000000 54490000000 Drug Manufacturer~
 8 GILD    2018       0.781 22127000000 17274000000 Drug Manufacturer~
 9 BMY     2018       0.71  22561000000 16014000000 Drug Manufacturer~
10 BIIB    2018       0.865 13452900000 11636600000 Drug Manufacturer~
11 AMGN    2018       0.827 23747000000 19646000000 Drug Manufacturer~
12 AGN     2018       0.861 15787400000 13596000000 Drug Manufacturer~
13 ABBV    2018       0.764 32753000000 25035000000 Drug Manufacturer~

Questions: join_ticker

drug_cos_subset  <- drug_cos  %>% 
  filter(ticker == "JNJ")

drug_cos_subset
# A tibble: 8 x 9
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 JNJ    John~ New Jer~        0.247       0.687     0.149 0.199 0.161
2 JNJ    John~ New Jer~        0.272       0.678     0.161 0.218 0.173
3 JNJ    John~ New Jer~        0.281       0.687     0.194 0.224 0.197
4 JNJ    John~ New Jer~        0.336       0.694     0.22  0.284 0.217
5 JNJ    John~ New Jer~        0.335       0.693     0.22  0.282 0.219
6 JNJ    John~ New Jer~        0.338       0.697     0.23  0.286 0.229
7 JNJ    John~ New Jer~        0.317       0.667     0.017 0.243 0.019
8 JNJ    John~ New Jer~        0.318       0.668     0.188 0.233 0.244
# ... with 1 more variable: year <dbl>
combo_df  <- drug_cos_subset  %>% 
  left_join(health_cos)

combo_df
# A tibble: 8 x 17
  ticker name  location ebitdamargin grossmargin netmargin   ros   roe
  <chr>  <chr> <chr>           <dbl>       <dbl>     <dbl> <dbl> <dbl>
1 JNJ    John~ New Jer~        0.247       0.687     0.149 0.199 0.161
2 JNJ    John~ New Jer~        0.272       0.678     0.161 0.218 0.173
3 JNJ    John~ New Jer~        0.281       0.687     0.194 0.224 0.197
4 JNJ    John~ New Jer~        0.336       0.694     0.22  0.284 0.217
5 JNJ    John~ New Jer~        0.335       0.693     0.22  0.282 0.219
6 JNJ    John~ New Jer~        0.338       0.697     0.23  0.286 0.229
7 JNJ    John~ New Jer~        0.317       0.667     0.017 0.243 0.019
8 JNJ    John~ New Jer~        0.318       0.668     0.188 0.233 0.244
# ... with 9 more variables: year <dbl>, revenue <dbl>, gp <dbl>,
#   rnd <dbl>, netincome <dbl>, assets <dbl>, liabilities <dbl>,
#   marketcap <dbl>, industry <chr>
co_name  <- combo_df  %>% 
  distinct(name) %>% 
  pull()

co_location  <- combo_df  %>% 
  distinct(location)  %>% 
  pull() 

co_industry  <- combo_df  %>% 
  distinct(industry)  %>% 
  pull() 

Put the r inline commands used in the blanks below. When you knit the document the results of the commands will be displayed in your text.

The company Johnson & Johnson is located in New Jersey; U.S.A and is a member of the Drug Manufacturers industry group.


Start with combo_df

Select variables (in this order): year, grossmargin, netmargin, revenue, gp, netincome

combo_df_subset  <- combo_df  %>% 
  select(year, grossmargin, netmargin, revenue, gp, netincome)
combo_df_subset
# A tibble: 8 x 6
   year grossmargin netmargin     revenue          gp   netincome
  <dbl>       <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
1  2011       0.687     0.149 65030000000 44670000000  9672000000
2  2012       0.678     0.161 67224000000 45566000000 10853000000
3  2013       0.687     0.194 71312000000 48970000000 13831000000
4  2014       0.694     0.22  74331000000 51585000000 16323000000
5  2015       0.693     0.22  70074000000 48538000000 15409000000
6  2016       0.697     0.23  71890000000 50101000000 16540000000
7  2017       0.667     0.017 76450000000 51011000000  1300000000
8  2018       0.668     0.188 81581000000 54490000000 15297000000
combo_df_subset  %>% 
  mutate(grossmargin_check = gp / revenue,
  close_enough = abs(grossmargin_check - grossmargin) < 0.001)
# A tibble: 8 x 8
   year grossmargin netmargin     revenue          gp   netincome
  <dbl>       <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
1  2011       0.687     0.149 65030000000 44670000000  9672000000
2  2012       0.678     0.161 67224000000 45566000000 10853000000
3  2013       0.687     0.194 71312000000 48970000000 13831000000
4  2014       0.694     0.22  74331000000 51585000000 16323000000
5  2015       0.693     0.22  70074000000 48538000000 15409000000
6  2016       0.697     0.23  71890000000 50101000000 16540000000
7  2017       0.667     0.017 76450000000 51011000000  1300000000
8  2018       0.668     0.188 81581000000 54490000000 15297000000
# ... with 2 more variables: grossmargin_check <dbl>,
#   close_enough <lgl>

combo_df_subset  %>% 
  mutate(netmargin_check = netincome / revenue,
  close_enough = abs(netmargin_check - netmargin) < 0.001)
# A tibble: 8 x 8
   year grossmargin netmargin     revenue          gp   netincome
  <dbl>       <dbl>     <dbl>       <dbl>       <dbl>       <dbl>
1  2011       0.687     0.149 65030000000 44670000000  9672000000
2  2012       0.678     0.161 67224000000 45566000000 10853000000
3  2013       0.687     0.194 71312000000 48970000000 13831000000
4  2014       0.694     0.22  74331000000 51585000000 16323000000
5  2015       0.693     0.22  70074000000 48538000000 15409000000
6  2016       0.697     0.23  71890000000 50101000000 16540000000
7  2017       0.667     0.017 76450000000 51011000000  1300000000
8  2018       0.668     0.188 81581000000 54490000000 15297000000
# ... with 2 more variables: netmargin_check <dbl>,
#   close_enough <lgl>

Question: summarize_industry

-Fill in the blanks

-Put the command you use in the Rchunks in the Rmd file for this quiz

-Use the health_cos data

-For each industry calculate

health_cos  %>% 
  group_by(industry)  %>% 
  summarize(mean_netmargin_percent = mean(netincome / revenue) * 100 ,
            median_netmargin_percent = median(netincome / revenue) * 100 ,
            min_netmargin_percent = min(netincome / revenue) * 100 ,
            max_netmargin_percent = max(netincome / revenue) * 100)
# A tibble: 9 x 5
  industry          mean_netmargin_~ median_netmargi~ min_netmargin_p~
  <chr>                        <dbl>            <dbl>            <dbl>
1 Biotechnology                -4.66             7.62         -197.   
2 Diagnostics & Re~            13.1             12.3             0.399
3 Drug Manufacture~            19.4             19.5           -34.9  
4 Drug Manufacture~             5.88             9.01          -76.0  
5 Healthcare Plans              3.28             3.37           -0.305
6 Medical Care Fac~             6.10             6.46            1.40 
7 Medical Devices              12.4             14.3           -56.1  
8 Medical Distribu~             1.70             1.03           -0.102
9 Medical Instrume~            12.3             14.0           -47.1  
# ... with 1 more variable: max_netmargin_percent <dbl>

Question: inline_ticker

health_cos_subset  <- health_cos  %>% 
  filter(ticker == "AMGN")
health_cos_subset
# A tibble: 8 x 11
  ticker name     revenue      gp    rnd netincome  assets liabilities
  <chr>  <chr>      <dbl>   <dbl>  <dbl>     <dbl>   <dbl>       <dbl>
1 AMGN   Amgen I~ 1.56e10 1.29e10 3.17e9    3.68e9 4.89e10 29842000000
2 AMGN   Amgen I~ 1.73e10 1.41e10 3.38e9    4.34e9 5.43e10 35238000000
3 AMGN   Amgen I~ 1.87e10 1.53e10 4.08e9    5.08e9 6.61e10 44029000000
4 AMGN   Amgen I~ 2.01e10 1.56e10 4.30e9    5.16e9 6.90e10 43231000000
5 AMGN   Amgen I~ 2.17e10 1.74e10 4.07e9    6.94e9 7.14e10 43366000000
6 AMGN   Amgen I~ 2.30e10 1.88e10 3.84e9    7.72e9 7.76e10 47751000000
7 AMGN   Amgen I~ 2.28e10 1.88e10 3.56e9    1.98e9 8.00e10 54713000000
8 AMGN   Amgen I~ 2.37e10 1.96e10 3.74e9    8.39e9 6.64e10 53916000000
# ... with 3 more variables: marketcap <dbl>, year <dbl>,
#   industry <chr>

Run the code below

health_cos_subset  %>% 
  distinct(name) %>%  
  pull(name)
[1] "Amgen Inc"
co_name  <- health_cos_subset  %>% 
  distinct(name) %>% 
  pull(name)

You can take output from your code and include it in your text.

co_industry  <- health_cos_subset  %>% 
distinct(industry) %>% 
  pull(industry)

Steps 7-11

  1. Prepare the data for the plots
df <- health_cos  %>% 
  group_by(industry)  %>%
  summarize(med_rnd_rev = median(rnd/revenue))
  1. Use glimpse to glimpse the data for the plots
df  %>% glimpse()
Rows: 9
Columns: 2
$ industry    <chr> "Biotechnology", "Diagnostics & Research", "Drug~
$ med_rnd_rev <dbl> 0.48317287, 0.05620271, 0.17451442, 0.06851879, ~
  1. Create a static bar chart
ggplot(data = df, 
       mapping = aes(
         x = reorder(industry, med_rnd_rev ),
         y = med_rnd_rev
         )) +
  geom_col() + 
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(
    title = "Median R&D expenditures",
    subtitle = "by industry as a percent of revenue from 2011 to 2018",
    x = NULL, y = NULL) +
  theme_ipsum()

ggsave(filename = "preview.png", 
       path = here::here("_posts", "2022-03-08-joining-data"))
df  %>% 
  arrange(med_rnd_rev)  %>%
  e_charts(
    x = industry
    )  %>% 
  e_bar(
    serie = med_rnd_rev, 
    name = "median"
    )  %>%
  e_flip_coords()  %>% 
  e_tooltip()  %>% 
  e_title(
    text = "Median industry R&D expenditures", 
    subtext = "by industry as a percent of revenue from 2011 to 2018",
    left = "center") %>% 
  e_legend(FALSE) %>% 
  e_x_axis(
    formatter = e_axis_formatter("percent", digits = 0)
    )  %>%
  e_y_axis(
    show = FALSE
  )  %>% 
  e_theme("infographic")