Introduction

This document is an exploratory analysis of all accepted full papers, and posters at the GIScience conference series. The analysis is based on the text analysis published in “Reproducible research and GIScience: an evaluation using AGILE conference papers” (https://doi.org/10.7717/peerj.5072).

library("here")
library("pdftools")
library("stringr")
library("tidyverse")
library("tidytext")
library("wordcloud")
library("RColorBrewer")
library("grid")
library("gridBase")
library("gridExtra")
library("kableExtra")
library("quanteda")

# Fix the random number generator state so the word cloud layout is
# reproducible; the seed is the character count of the conference name.
set.seed(seed = nchar(x = "International Conference on Geographic Information Science"))

Load data

List of proceedings

LNCS proceedings are available at the publisher website: https://link.springer.com/conference/giscience.

Note: The 2018 proceedings include the short papers in the same document. For comparability, only the full papers are taken into account for the analysis below.

# Directory that holds the proceedings PDFs
data_path <- here::here("proceedings")

# PDF file name of each conference edition, keyed by conference year
proceedings <- c(
  "2002" = "geographic-information-science-2002.pdf",
  "2004" = "geographic-information-science-2004.pdf",
  "2006" = "geographic-information-science-2006.pdf",
  "2008" = "geographic-information-science-2008.pdf",
  "2010" = "geographic-information-science-2010.pdf",
  "2012" = "10.1007_978-3-642-33024-7.pdf",
  "2014" = "10.1007_978-3-319-11593-1.pdf",
  "2016" = "10.1007_978-3-319-45738-3.pdf",
  "2018" = "lipics-vol114-giscience2018-complete.pdf"
)

# Full path of every PDF, named by the same conference years
proceedings_files <- stats::setNames(
  file.path(data_path, proceedings),
  names(proceedings)
)

Add the PDFs to a directory called /home/rstudio/proceedings next to the file giscience-historic-text-analysis.Rmd (this file). The proceedings of the papers are not openly available for the years 2012 to 2016. You can contact the original paper authors and ask for the test dataset to reproduce the full analysis. Alternatively, you can download the 2018 proceedings from the LIPIcs website (Open Access; direct PDF link) and conduct the analysis with that subset of the data. For the analysis below the following input files were used:

# Render the year -> file mapping as a striped, non-full-width table
kableExtra::kable_styling(
  knitr::kable(
    tibble::tibble(year = names(proceedings), file = proceedings)
  ),
  "striped",
  full_width = FALSE
)
year file
2002 geographic-information-science-2002.pdf
2004 geographic-information-science-2004.pdf
2006 geographic-information-science-2006.pdf
2008 geographic-information-science-2008.pdf
2010 geographic-information-science-2010.pdf
2012 10.1007_978-3-642-33024-7.pdf
2014 10.1007_978-3-319-11593-1.pdf
2016 10.1007_978-3-319-45738-3.pdf
2018 lipics-vol114-giscience2018-complete.pdf
# Code not evaluated when document is rendered!
# Downloads every PDF found in the shared Google Drive folder into data_path.
dir.create(data_path, showWarnings = FALSE)

library("googledrive")
drive_dir <- googledrive::drive_get("https://drive.google.com/drive/folders/17EUtM_zCx1gQMea1MHN_5XSVrssxv9GA")
drive_dir_contents <- googledrive::drive_ls(drive_dir)

# Walk the listing by integer row position (safe for an empty listing and
# independent of row names) and fetch only the entries named *.pdf
for (i in seq_len(nrow(drive_dir_contents))) {
  current <- drive_dir_contents[i, ]
  if (endsWith(current$name, ".pdf")) {
    googledrive::drive_download(
      googledrive::as_id(current$id),
      file.path(data_path, current$name)
    )
  }
}

The text is extracted from the PDFs and processed to create a tidy data structure without stop words. The stop words include specific words, such as university, which appears on many pages, abbreviations such as e.g., and terms particular to scientific articles, such as figure. All numeric literals are also removed from the word list.

# Extract the text of every proceedings PDF (one list element per year)
texts <- lapply(proceedings_files, pdftools::pdf_text)

# Optionally keep the 2018 short papers (everything after page 282) as a
# separate "2018-sp" entry; it comes from the same PDF as the 2018 full
# papers, so the file is looked up by name rather than by position.
if (params$with_sp) {
  texts[["2018-sp"]] <- utils::tail(texts[["2018"]], -282)
  proceedings_files <- c(proceedings_files,
                         `2018-sp` = proceedings_files[["2018"]])
}

# don't include short papers in 2018 year: keep pages 1 to 282 only
texts[["2018"]] <- utils::head(texts[["2018"]], 282)

# Join the pages of each volume into a single string. `collapse` must be a
# separator string: a logical TRUE is not a valid separator and would leak
# the literal text "TRUE" between pages (and thus into the word counts).
texts <- vapply(texts, stringr::str_c, character(1), collapse = "\n")

# One row per proceedings volume: year label, source file and full text
tidy_texts <- tibble::tibble(year = names(texts),
                             path = proceedings_files,
                             text = texts)

# create a table of all words: split the text of every year into word tokens
all_words <- tidy_texts %>%
  dplyr::select(year, text) %>%
  tidytext::unnest_tokens(word, text)

# stop words specific to this corpus, tagged with their own lexicon name
my_stop_words <- tibble::tibble(
  word = c(
    "et",
    "al",
    "fig",
    "e.g",
    "i.e",
    "http",
    "ing",
    "pp",
    "figure",
    "based",
    "conference",
    "university",
    "table"
  ),
  lexicon = "giscience"
)

# general stop words shipped with tidytext plus the corpus-specific ones
all_stop_words <- tidytext::stop_words %>%
  dplyr::bind_rows(my_stop_words)

# remove numbers: as.numeric() returns NA (with a coercion warning) for every
# token that is not a number, so only those tokens are kept; the expected
# warnings are silenced for this one step only
suppressWarnings({
  no_numbers <- all_words %>%
    dplyr::filter(is.na(as.numeric(word)))
})

# remove stop words
no_stop_words <- no_numbers %>%
  dplyr::anti_join(all_stop_words, by = "word")

# word counts before and after stop word removal
total_words <- nrow(no_numbers)
after_cleanup <- nrow(no_stop_words)

About 50 % of the words are considered stop words. The following table shows how many non-stop words each conference year has, sorted by number of non-stop words (descending).

# Number of non-stop words per year, largest first
nonstopwords_per_year <- no_stop_words %>%
  dplyr::count(year, name = "non-stop words") %>%
  dplyr::arrange(dplyr::desc(`non-stop words`))

# Number of all (non-numeric) words per year, largest first
words_per_year <- no_numbers %>%
  dplyr::count(year, name = "all words") %>%
  dplyr::arrange(dplyr::desc(`all words`))

# Combine both counts per year, append a bold "Total" row and render the table
dplyr::inner_join(nonstopwords_per_year, words_per_year, by = "year") %>%
  dplyr::bind_rows(
    tibble::tibble(
      year = "Total",
      `non-stop words` = sum(nonstopwords_per_year$`non-stop words`),
      `all words` = sum(words_per_year$`all words`)
    )
  ) %>%
  knitr::kable() %>%
  kableExtra::kable_styling("striped", full_width = FALSE) %>%
  kableExtra::row_spec(nrow(nonstopwords_per_year) + 1, bold = TRUE)
year non-stop words all words
2006 80336 168616
2014 74995 149063
2012 73440 141769
2004 68060 134234
2016 66642 130669
2008 64056 131032
2018 62111 121388
2002 58636 117016
2010 55408 110342
Total 603684 1204129

Top wordstems and wordstem clouds

# Both limits were chosen manually.
# Upper bound on the number of wordstems in the top list and in each cloud
max_words <- 100
# Lowest per-year frequency a wordstem needs to be drawn in a cloud
minimum_occurence <- 99

The following table shows the number of occurrences of the 100 most frequently occurring wordstems across all proceedings.

# Add the stem of every remaining word as a new column `wordstem`
wordstems <- dplyr::mutate(
  no_stop_words,
  wordstem = quanteda::char_wordstem(word)
)

# Count in how many distinct years each given wordstem occurs.
#
# Reads the global `wordstems` table (columns `year` and `wordstem`).
#
# the_word: character vector of wordstems to look up.
# Returns an integer vector of the same length as `the_word`, named by
# `the_word`; wordstems that do not occur at all get a count of 0.
#
# The table is scanned once for all requested wordstems instead of once per
# wordstem, and the result type does not depend on the input length.
countYearsUsingWordstem <- function(the_word) {
  # only rows of requested wordstems matter
  wanted <- wordstems$wordstem %in% the_word

  # one row per distinct (year, wordstem) combination
  year_wordstem_pairs <- unique(data.frame(
    year = wordstems$year[wanted],
    wordstem = wordstems$wordstem[wanted],
    stringsAsFactors = FALSE
  ))

  # number of distinct years per wordstem
  years_per_wordstem <- table(year_wordstem_pairs$wordstem)

  counts <- as.integer(years_per_wordstem)[
    match(the_word, names(years_per_wordstem))
  ]
  # wordstems without any occurrence are reported as 0
  counts[is.na(counts)] <- 0L

  stats::setNames(counts, the_word)
}

# The `max_words` most frequent wordstems across all years, with their total
# count `n`, the number of years they occur in, and their rank (`place`) as
# the first column. seq_len() keeps the rank column valid for an empty table.
top_wordstems <- wordstems %>%
  dplyr::group_by(wordstem) %>%
  dplyr::tally() %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  head(n = max_words) %>%
  dplyr::mutate(`years w/ wordstem` = countYearsUsingWordstem(wordstem)) %>%
  tibble::add_column(place = seq_len(nrow(.)), .before = 1)

# make sure the output directory exists before writing into it
dir.create(here::here("results"), showWarnings = FALSE, recursive = TRUE)
write.csv(top_wordstems, here::here("results/text_analysis_topwordstems.csv"), row.names = FALSE)

# show the ranking as a scrollable table
top_wordstems %>%
  knitr::kable() %>%
  kableExtra::kable_styling("striped", full_width = FALSE) %>%
  kableExtra::scroll_box(height = "300px")
place wordstem n years w/ wordstem
1 data 7087 9
2 spatial 5442 9
3 model 4249 9
4 relat 4063 9
5 time 3547 9
6 inform 3419 9
7 set 3334 9
8 map 3331 9
9 object 2962 9
10 network 2653 9
11 locat 2642 9
12 region 2560 9
13 geograph 2553 9
14 comput 2347 9
15 result 2301 9
16 space 2267 9
17 algorithm 2234 9
18 node 2191 9
19 approach 1957 9
20 system 1938 9
21 rout 1904 9
22 true 1901 9
23 type 1890 9
24 method 1886 9
25 distanc 1824 9
26 edg 1824 9
27 queri 1721 9
28 process 1683 9
29 similar 1678 9
30 user 1600 9
31 repres 1560 9
32 intersect 1521 9
33 section 1521 9
34 analysi 1499 9
35 line 1487 9
36 road 1458 9
37 ontolog 1442 9
38 direct 1431 9
39 structur 1429 9
40 featur 1331 9
41 graph 1331 9
42 function 1319 9
43 measur 1315 9
44 pattern 1307 9
45 label 1293 9
46 research 1278 9
47 tempor 1276 9
48 paper 1224 9
49 event 1202 9
50 studi 1201 9
51 ed 1198 9
52 refer 1175 9
53 concept 1151 9
54 level 1148 9
55 path 1142 9
56 cell 1097 9
57 semant 1097 9
58 generat 1091 9
59 applic 1078 9
60 topolog 1048 9
61 provid 1036 9
62 class 1024 9
63 scienc 1009 9
64 intern 1008 9
65 oper 1003 9
66 requir 1000 9
67 boundari 992 9
68 term 988 9
69 local 978 9
70 develop 977 9
71 includ 966 9
72 defin 953 9
73 sensor 945 9
74 valu 923 9
75 chang 922 9
76 step 922 9
77 appli 915 9
78 gis 912 9
79 select 906 9
80 instanc 901 9
81 complex 900 9
82 scale 898 9
83 segment 891 9
84 environ 885 9
85 dataset 881 9
86 databas 876 9
87 connect 872 9
88 φ 872 5
89 represent 862 9
90 differ 857 8
91 size 845 9
92 cluster 843 9
93 decis 843 9
94 attribut 841 9
95 search 839 9
96 perform 830 9
97 properti 828 9
98 support 824 9
99 experi 820 9
100 distribut 813 9

The following clouds and table are based on word stems extracted with a stemming algorithm from package quanteda. Words must occur at least 99 times to be included in the cloud. Each cloud has a maximum of 100 words.

# Frequency of every wordstem per year, most frequent first
cloud_wordstems <- wordstems %>%
  dplyr::group_by(year, wordstem) %>%
  dplyr::tally() %>%
  dplyr::arrange(dplyr::desc(n))

# make sure the output directory exists before opening the PNG device
dir.create(here::here("results"), showWarnings = FALSE, recursive = TRUE)

# plot is created to file to fit more words to a specific pixel size
png(filename = here::here("results/text_analysis_wordstemclouds.png"),
    width = 1000,
    height = 1000)

# one cloud per conference year, filled row by row into a 3 x 3 grid
par(mfrow = c(3, 3))
for (the_year in names(proceedings)) {
  # wordstems of this year that are frequent enough, capped at max_words
  year_cloud_wordstems <- cloud_wordstems %>%
    dplyr::filter(year == the_year) %>%
    dplyr::filter(n >= minimum_occurence) %>%
    head(n = max_words)

  # `colors` is spelled out in full instead of relying on partial argument
  # matching of `color`
  wordcloud::wordcloud(words = year_cloud_wordstems$wordstem,
                       freq = year_cloud_wordstems$n,
                       min.freq = 1,
                       random.order = FALSE,
                       fixed.asp = FALSE,
                       rot.per = 0,
                       colors = RColorBrewer::brewer.pal(8, "Dark2"))
}
dev.off()
## png 
##   2
# Copy the rendered cloud image from results/ into docs/, replacing any
# existing copy
file.copy(
  to = here::here("docs/text_analysis_wordstemclouds.png"),
  from = here::here("results/text_analysis_wordstemclouds.png"),
  overwrite = TRUE
)
## [1] TRUE

Word clouds of full papers per conference year (rowwise, starting top left, from 2002 to 2018).

Colophon

This document is licensed under a Creative Commons Attribution 4.0 International License. All contained code is licensed under the Apache License 2.0. This document is versioned in a public git repository, https://github.com/nuest/reproducible-research-at-giscience, and archived on Zenodo at https://doi.org/10.5281/zenodo.4032875.

Runtime environment description:

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value                       
##  version  R version 3.6.3 (2020-02-29)
##  os       Debian GNU/Linux 10 (buster)
##  system   x86_64, linux-gnu           
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       Etc/UTC                     
##  date     2021-06-01                  
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package      * version date       lib source                            
##  askpass        1.1     2019-01-13 [1] CRAN (R 3.6.3)                    
##  assertthat     0.2.1   2019-03-21 [1] CRAN (R 3.6.3)                    
##  backports      1.1.6   2020-04-05 [1] CRAN (R 3.6.3)                    
##  base         * 3.6.3   2020-05-14 [2] local                             
##  broom          0.5.6   2020-04-20 [1] CRAN (R 3.6.3)                    
##  callr          3.4.3   2020-03-28 [1] CRAN (R 3.6.3)                    
##  cellranger     1.1.0   2016-07-27 [1] CRAN (R 3.6.3)                    
##  cli            2.0.2   2020-02-28 [1] CRAN (R 3.6.3)                    
##  colorspace     1.4-1   2019-03-18 [1] CRAN (R 3.6.3)                    
##  compiler       3.6.3   2020-05-14 [2] local                             
##  crayon         1.3.4   2017-09-16 [1] CRAN (R 3.6.3)                    
##  data.table     1.12.8  2019-12-09 [1] CRAN (R 3.6.3)                    
##  datasets     * 3.6.3   2020-05-14 [2] local                             
##  DBI            1.1.0   2019-12-15 [1] CRAN (R 3.6.3)                    
##  dbplyr         1.4.3   2020-04-19 [1] CRAN (R 3.6.3)                    
##  desc           1.2.0   2018-05-01 [1] CRAN (R 3.6.3)                    
##  devtools       2.3.0   2020-04-10 [1] CRAN (R 3.6.3)                    
##  digest         0.6.25  2020-02-23 [1] CRAN (R 3.6.3)                    
##  dplyr        * 0.8.5   2020-03-07 [1] CRAN (R 3.6.3)                    
##  ellipsis       0.3.0   2019-09-20 [1] CRAN (R 3.6.3)                    
##  evaluate       0.14    2019-05-28 [1] CRAN (R 3.6.3)                    
##  fansi          0.4.1   2020-01-08 [1] CRAN (R 3.6.3)                    
##  fastmatch      1.1-0   2017-01-28 [1] CRAN (R 3.6.3)                    
##  forcats      * 0.5.0   2020-03-01 [1] CRAN (R 3.6.3)                    
##  fs             1.4.1   2020-04-04 [1] CRAN (R 3.6.3)                    
##  generics       0.0.2   2018-11-29 [1] CRAN (R 3.6.3)                    
##  ggplot2      * 3.3.0   2020-03-05 [1] CRAN (R 3.6.3)                    
##  glue           1.4.0   2020-04-03 [1] CRAN (R 3.6.3)                    
##  graphics     * 3.6.3   2020-05-14 [2] local                             
##  grDevices    * 3.6.3   2020-05-14 [2] local                             
##  grid         * 3.6.3   2020-05-14 [2] local                             
##  gridBase     * 0.4-7   2014-02-24 [1] CRAN (R 3.6.3)                    
##  gridExtra    * 2.3     2017-09-09 [1] CRAN (R 3.6.3)                    
##  gtable         0.3.0   2019-03-25 [1] CRAN (R 3.6.3)                    
##  haven          2.2.0   2019-11-08 [1] CRAN (R 3.6.3)                    
##  here         * 0.1     2017-05-28 [1] CRAN (R 3.6.3)                    
##  highr          0.8     2019-03-20 [1] CRAN (R 3.6.3)                    
##  hms            0.5.3   2020-01-08 [1] CRAN (R 3.6.3)                    
##  htmltools      0.4.0   2019-10-04 [1] CRAN (R 3.6.3)                    
##  httr           1.4.1   2019-08-05 [1] CRAN (R 3.6.3)                    
##  janeaustenr    0.1.5   2017-06-10 [1] CRAN (R 3.6.3)                    
##  jsonlite       1.6.1   2020-02-02 [1] CRAN (R 3.6.3)                    
##  kableExtra   * 1.1.0   2019-03-16 [1] CRAN (R 3.6.3)                    
##  knitr          1.28    2020-02-06 [1] CRAN (R 3.6.3)                    
##  lattice        0.20-38 2018-11-04 [2] CRAN (R 3.6.3)                    
##  lifecycle      0.2.0   2020-03-06 [1] CRAN (R 3.6.3)                    
##  lubridate      1.7.8   2020-04-06 [1] CRAN (R 3.6.3)                    
##  magrittr       1.5     2014-11-22 [1] CRAN (R 3.6.3)                    
##  Matrix         1.2-18  2019-11-27 [2] CRAN (R 3.6.3)                    
##  memoise        1.1.0   2017-04-21 [1] CRAN (R 3.6.3)                    
##  methods      * 3.6.3   2020-05-14 [2] local                             
##  modelr         0.1.6   2020-02-22 [1] CRAN (R 3.6.3)                    
##  munsell        0.5.0   2018-06-12 [1] CRAN (R 3.6.3)                    
##  nlme           3.1-144 2020-02-06 [2] CRAN (R 3.6.3)                    
##  pdftools     * 2.3     2019-11-10 [1] CRAN (R 3.6.3)                    
##  pillar         1.4.3   2019-12-20 [1] CRAN (R 3.6.3)                    
##  pkgbuild       1.0.6   2019-10-09 [1] CRAN (R 3.6.3)                    
##  pkgconfig      2.0.3   2019-09-22 [1] CRAN (R 3.6.3)                    
##  pkgload        1.0.2   2018-10-29 [1] CRAN (R 3.6.3)                    
##  prettyunits    1.1.1   2020-01-24 [1] CRAN (R 3.6.3)                    
##  processx       3.4.2   2020-02-09 [1] CRAN (R 3.6.3)                    
##  ps             1.3.2   2020-02-13 [1] CRAN (R 3.6.3)                    
##  purrr        * 0.3.4   2020-04-17 [1] CRAN (R 3.6.3)                    
##  qpdf           1.1     2019-03-07 [1] CRAN (R 3.6.3)                    
##  quanteda     * 2.0.1   2020-03-18 [1] CRAN (R 3.6.3)                    
##  R6             2.4.1   2019-11-12 [1] CRAN (R 3.6.3)                    
##  RColorBrewer * 1.1-2   2014-12-07 [1] CRAN (R 3.6.3)                    
##  Rcpp           1.0.4.6 2020-04-09 [1] CRAN (R 3.6.3)                    
##  RcppParallel   5.0.0   2020-03-11 [1] CRAN (R 3.6.3)                    
##  readr        * 1.3.1   2018-12-21 [1] CRAN (R 3.6.3)                    
##  readxl         1.3.1   2019-03-13 [1] CRAN (R 3.6.3)                    
##  remotes        2.1.1   2020-02-15 [1] CRAN (R 3.6.3)                    
##  reprex         0.3.0   2019-05-16 [1] CRAN (R 3.6.3)                    
##  rlang          0.4.5   2020-03-01 [1] CRAN (R 3.6.3)                    
##  rmarkdown      2.5     2021-06-01 [1] Github (rstudio/rmarkdown@4ff2093)
##  rprojroot      1.3-2   2018-01-03 [1] CRAN (R 3.6.3)                    
##  rstudioapi     0.11    2020-02-07 [1] CRAN (R 3.6.3)                    
##  rvest          0.3.5   2019-11-08 [1] CRAN (R 3.6.3)                    
##  scales         1.1.0   2019-11-18 [1] CRAN (R 3.6.3)                    
##  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 3.6.3)                    
##  SnowballC      0.7.0   2020-04-01 [1] CRAN (R 3.6.3)                    
##  stats        * 3.6.3   2020-05-14 [2] local                             
##  stopwords      2.0     2020-04-14 [1] CRAN (R 3.6.3)                    
##  stringi        1.4.6   2020-02-17 [1] CRAN (R 3.6.3)                    
##  stringr      * 1.4.0   2019-02-10 [1] CRAN (R 3.6.3)                    
##  testthat       2.3.2   2020-03-02 [1] CRAN (R 3.6.3)                    
##  tibble       * 3.0.1   2020-04-20 [1] CRAN (R 3.6.3)                    
##  tidyr        * 1.0.2   2020-01-24 [1] CRAN (R 3.6.3)                    
##  tidyselect     1.0.0   2020-01-27 [1] CRAN (R 3.6.3)                    
##  tidytext     * 0.2.4   2020-04-17 [1] CRAN (R 3.6.3)                    
##  tidyverse    * 1.3.0   2019-11-21 [1] CRAN (R 3.6.3)                    
##  tokenizers     0.2.1   2018-03-29 [1] CRAN (R 3.6.3)                    
##  tools          3.6.3   2020-05-14 [2] local                             
##  usethis        1.6.0   2020-04-09 [1] CRAN (R 3.6.3)                    
##  utils        * 3.6.3   2020-05-14 [2] local                             
##  vctrs          0.2.4   2020-03-10 [1] CRAN (R 3.6.3)                    
##  viridisLite    0.3.0   2018-02-01 [1] CRAN (R 3.6.3)                    
##  webshot        0.5.2   2019-11-22 [1] CRAN (R 3.6.3)                    
##  withr          2.2.0   2020-04-20 [1] CRAN (R 3.6.3)                    
##  wordcloud    * 2.6     2018-08-24 [1] CRAN (R 3.6.3)                    
##  xfun           0.15    2021-06-01 [1] Github (yihui/xfun@06e86a6)       
##  xml2           1.3.2   2020-04-23 [1] CRAN (R 3.6.3)                    
##  yaml           2.2.1   2020-02-01 [1] CRAN (R 3.6.3)                    
## 
## [1] /usr/local/lib/R/site-library
## [2] /usr/local/lib/R/library
