# R/plos_authInflation.R

options(PlosApiKey = "<insert your API key here!>")
#install_github("rplos", "ropensci")
library("rplos")
library("ggplot2")
require("dplyr")

# Convert author strings to counts.
#
# cell: vector of author strings in which individual authors are
#       separated by ";" (anything else is coerced with as.character()).
# Returns an unnamed integer vector, one count per element of `cell`.
# An empty string yields 0; a missing value (NA) is also counted as 0
# rather than as a single author.
countAuths <- function(cell) {
  cell  <- as.character(cell)
  parts <- strsplit(cell, ";", fixed = TRUE)
  n     <- vapply(parts, length, integer(1))
  # strsplit() maps NA to a length-one NA, which would otherwise be
  # reported as one author.
  n[is.na(cell)] <- 0L
  n
}

# Query the PLoS API for up to `lim` papers per year for one journal,
# count the number of authors and return a data.frame.
#
# j:          journal key, used in the cross_published_journal_key filter
# lim:        maximum number of records requested per year
# start.year: first publication year to query
# end.year:   last publication year to query (inclusive)
#
# Returns one row per paper with the columns delivered by searchplos()
# (the `author` column is read below) plus `year`, `counts` and `journal`.
# Years for which the query returns no rows are skipped; if nothing is
# returned at all, a zero-row data.frame with the same columns is returned.
getAuths <- function(j, lim=1000, start.year=2006, end.year=2013){
  cat("Getting results for journal: ", j, "\n")
  # NOTE(review): the original author observed that results seem to come
  # back in reverse order by year; nothing below depends on row order.
  per.year <- lapply(start.year:end.year, function(i) {
    hits <- searchplos(
      q  = paste0('publication_date:[', i,
                  '-01-01T00:00:00Z TO ', i,
                  '-12-31T23:59:59Z]'),
      fl = "author",
      fq = list("doc_type:full",
                paste0("cross_published_journal_key:", j)),
      start = 0, limit = lim, sleep = 6)
    # An empty result cannot be combined with the scalar year in
    # data.frame(); drop it instead of aborting the whole journal.
    if (is.null(hits) || NROW(hits) == 0) {
      return(NULL)
    }
    data.frame(year = i, hits)
  })
  results <- do.call(rbind, per.year)
  if (is.null(results)) {
    return(data.frame(year = integer(0), author = character(0),
                      counts = integer(0), journal = character(0),
                      stringsAsFactors = FALSE))
  }
  results$counts <- countAuths(results$author)
  results$journal <- j
  results
}

# Fetch author counts for the selected journals (positions 1-5 and 7 of
# the vector returned by journalnamekey()) and stack them into one table.
journals <- journalnamekey()
plos.all <- sapply(X = journals[c(1:5, 7)], FUN = getAuths,
                   simplify = FALSE)
plos <- do.call("rbind", plos.all)

# Fig. 1: Bean plot showing distribution of author counts
#         per journal overall (violin + notched boxplot + median point,
#         log10 count axis, journals on the flipped axis).
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
svg("../figures/authInflation_f1_beanplot.svg", width = 6, height = 8)
print(
  ggplot(plos, aes(x=journal, y=counts, fill=journal)) +
    geom_violin(scale="width") +
    geom_boxplot(width=.12, fill=I("black"), notch=TRUE,
                 outlier.size=NA, col="grey40") +
    stat_summary(fun.y="median", geom="point", shape=20, col="white") +
    scale_y_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) +
    coord_flip() + labs(x="", y="Number of authors per paper") +
    theme_classic() + theme(legend.position="none") +
    scale_fill_brewer()
)
dev.off()

# Fig 2. ECDFs of the author count distributions, one curve per journal,
#        on a log10 x axis restricted to 1-300 authors.
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
svg("../figures/authInflation_f2_ecdf.svg", width = 5, height = 5)
print(
  ggplot(plos, aes(x=counts, col=journal)) +
    stat_ecdf(geom="smooth", se=FALSE, size=1.2) + theme_bw() +
    scale_x_log10(breaks=c(1:5, seq(10, 50, by=10), 100, 200, 300)) +
    theme(legend.position=c(.75,.33)) +
    labs(x="Number of authors per paper", y="ECDF",
         col="") + coord_cartesian(xlim=c(1,300)) +
    scale_color_brewer(type="qual", palette=6)
)
dev.off()

# Fig 3. Trends in author counts over time with
#        confidence limits on the means (bootstrap limits drawn as a
#        ribbon, yearly means drawn as a line, one colour per journal).
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
# Axis labels are passed to labs() as named arguments, the same form the
# other figures in this file use.
svg("../figures/authInflation_f3_ribbon.svg", width = 7, height = 7)
print(
  ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) +
    stat_summary(fun.data="mean_cl_boot", geom="ribbon",
                 alpha=I(.5)) +
    stat_summary(fun.y="mean", geom="line") +
    labs(x="Year", y="Mean number of authors per paper") +
    theme_bw() + theme(legend.position=c(.2,.85)) +
    scale_fill_brewer(type="qual", palette=2,
                      guide=guide_legend(direction="vertical",
                                         label.position="bottom",
                                         title=NULL, ncol=2,
                                         label.hjust=0.5)) +
    scale_color_brewer(type="qual", palette=2, guide="none")
)
dev.off()

# from http://stackoverflow.com/a/17024184/1274516
# Build the regression label shown on each graph facet.
#
# df: data.frame with numeric columns `counts` and `year`.
# Fits counts ~ year and returns a single character string holding a
# plotmath expression of the form  y = slope * x + intercept (R^2 == r2),
# with slope/intercept formatted to 3 and R^2 to 2 significant digits.
lm_eqn <- function(df) {
  fit <- summary(lm(counts ~ year, df))
  est <- coef(fit)[, "Estimate"]
  # [[ ]] drops the coefficient names so only the bare formatted value
  # is substituted into the expression.
  pieces <- list(
    beta = format(est[[2]], digits = 3),
    i    = format(est[[1]], digits = 3),
    r2   = format(fit$r.squared, digits = 2)
  )
  label <- substitute(~~y~"="~beta*x+i~(R^2==r2), pieces)
  as.character(as.expression(label))
}

# Mean author count per journal and year, and one regression label per
# journal (see lm_eqn). Nested calls are used instead of dplyr's `%.%`
# chain operator so the code does not depend on that operator being
# available; as.data.frame() drops dplyr's grouping before base R use.
means <- as.data.frame(
  summarise(group_by(plos, journal, year), counts = mean(counts))
)
b <- by(means, means$journal, lm_eqn)
df <- data.frame(beta = as.character(unclass(b)), journal = names(b),
                 stringsAsFactors = FALSE)
# Printed explicitly so the model summary is also shown under source().
print(summary(lm(counts ~ year + journal, data=means)))

# Highest yearly mean per journal; the label is placed 20% above it.
# Rows are matched by journal rather than by position so the labels stay
# attached to the right journal whatever order each table comes back in.
means <- as.data.frame(
  summarise(group_by(means, journal), m = max(counts))
)
df$top <- means$m[match(df$journal, means$journal)] * 1.2

# Fig 4. Facetted linear regression of author inflation per journal:
#        yearly means with bootstrap error bars, an lm fit per facet and
#        the regression equation from `df` printed at y = top.
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
# Axis labels are passed to labs() as named arguments, the same form the
# other figures in this file use.
svg("../figures/authInflation_f4_regression.svg", width = 8.5, height = 6)
print(
  ggplot(plos, aes(x=year, y=counts, col=journal, fill=journal)) +
    stat_summary(fun.data="mean_cl_boot", geom="errorbar",
                 width=.2, alpha=I(.5)) +
    stat_summary(fun.y="mean", geom="point") +
    facet_wrap(~journal, scales="free_y") +
    geom_smooth(method="lm") +
    scale_x_continuous(breaks=2006:2013) +
    labs(x="", y="Mean number of authors per paper") +
    theme_bw() + theme(axis.text.x=element_text(angle=45, hjust=1)) +
    scale_fill_brewer(type="qual", palette=2, guide="none") +
    scale_color_brewer(type="qual", palette=2, guide="none") +
    geom_text(data=df, aes(x=2009.5, y=top, label=beta), size=3,
              parse=TRUE)
)
dev.off()

# Overall estimate of author inflation: regress per-paper author counts
# on year with journal as a covariate, pooled over all papers.
# Recorded result: .21 extra authors per paper per year, on average
s <- summary(lm(formula = counts ~ year + journal, data = plos))

# Summary barchart data: one row per journal with a hard-coded trend
# estimate, its standard error and an impact factor (IF).
# NOTE(review): the numeric vectors are paired with journals purely by
# position in unique(means$journal) -- confirm the order before reuse.
bc <- local({
  out <- data.frame(
    journal = unique(means$journal),
    trend   = c(0.2490979, 0.1211823, 0.5201688,
                0.4088536, 0.05894102, 0.1828939),
    std.err = c(0.08224567, 0.02213142, 0.1493662,
                0.06361849, 0.03891493, 0.03798822),
    IF      = c(12.690, 4.867, 8.517, 15.253, 3.730, 8.136)
  )
  # Order the journal factor by increasing trend so bars plot sorted.
  by.trend <- order(out$trend)
  out$journal <- factor(out$journal, levels=out$journal[by.trend])
  out
})

# Fig 5. Barchart of author inflation estimate per journal, with error
#        bars of +/- one std.err around each trend value.
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
svg("../figures/authInflation_f5_barchart.svg", width = 5, height = 7)
print(
  ggplot(bc, aes(x=journal, y=trend, fill=journal, ymin=trend-std.err,
                 ymax=trend+std.err)) +
    geom_bar(stat="identity") +
    geom_errorbar(width=.2) +
    scale_y_continuous(expand=c(0,0)) +
    theme_classic() +
    labs(x="",
         y="Estimate of annual author inflation (additional mean authors per paper)") +
    theme(axis.text.x=element_text(angle=45, hjust=1)) +
    scale_fill_brewer(palette="Blues", guide="none")
)
dev.off()

# Correlation between the per-journal trend estimate and impact factor
# (cor() default method), shown as an annotation in Fig 6.
pcc <- cor(bc$trend, bc$IF)
# Fig 6. Correlation of author inflation and journal impact factors.
# The plot is print()ed explicitly so the SVG is also drawn when this
# script is run through source(), where top-level auto-printing is off.
svg("../figures/authInflation_f6_IFcor.svg", width = 5, height = 5)
print(
  ggplot(bc, aes(x=trend, y=IF, col=journal)) +
    geom_text(aes(label=journal)) + xlim(0,.6) +
    labs(x="Author inflation estimate",
         y="Journal impact factor (2012)") +
    scale_color_brewer(type="qual", palette=2, guide="none") +
    annotate("text", x=.05, y=15,
             label=paste0("rho == ", format(pcc, digits=2)), parse=TRUE)
)
dev.off()

# Significance test for the correlation above.
# Recorded result: not significant (p = 0.18)
print(cor.test(bc$trend, bc$IF))