Quickly read Excel worksheets into R (Windows only…sorry)
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.
I suppose most companies use the Microsoft Office suite of programs, and my office is no exception. It easy to import data from an API or a database into R, but importing data from an Excel workbook is a different story. There are a few R packages for reading Excel files, but I’ve had problems with all of them:
- `read.xlsx` (`gdata` package): pretty convenient to run in R, but requires Perl which for some reason I have a hard time installing on my Windows machine…that might just be an issue with me, not the machine.
- `odbcConnectExcel2007` (`RODBC` package): from what I’ve seen on the listservs, this one has a hard time reading xlsx files because of a driver mismatch – you have to access the files through 32-bit R, which is annoying.
- `readWorksheetFromFile` (`XLConnect` package): uses Java, easy to install, and has tons of functionality to write in addition to read, but I don’t really need the write functionality and for large files especially XLConnect is very slow.
So I set off in search of a faster way to pull information out of an Excel file. The gist below shows what I came up with. Excel already has Visual Basic capabilities built in. So I stole a little VB script from here and stuck it in a function that writes the script to a temporary file, calls the script from the command line, and then outputs the contents of the formerly-Excel file.
The function takes the following inputs:
- file_path: the full path of the Excel file
- keep_sheets: the names of the sheets you want to keep; extracts all sheets if NULL
- target_dir: the directory to which you want to output the excel sheets; defaults to the directory where the Excel sheet is kept. If set to FALSE, it doesn’t output any files – rather, it reads them all into R as a list of data frames.
Unfortunately, this only works on Windows. I had hoped it would work on a Mac where Excel was installed, but I haven’t had any luck getting it to work on my home computer.
# The MIT License (MIT) | |
# | |
# Copyright (c) 2012 Schaun Jacob Wheeler | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
excelToCsv <- function(file_path, keep_sheets = NULL, target_dir = NULL, ...) { | |
temp_already <- list.files(tempdir()) | |
if(is.null(target_dir)) { | |
file_root <- gsub("([[:print:]]+(/|\\\\))[[:print:]]+", "\\1", file_path) | |
} else if(!is.null(target_dir) & target_dir != FALSE) { | |
file_root <- target_dir | |
} | |
file_name <- gsub("[[:print:]]+(/|\\\\)", "", file_path) | |
file_ext <- gsub("[[:print:]]+(.xls.?)", "\\1", file_path) | |
converter_file <- file(paste0(tempdir(),"/", "converter.vbs")) | |
writeLines( | |
c('rem XLS_To_CSV.vbs', | |
'rem =============================================================', | |
'rem convert all NON-empty worksheets in an Excel file to csv', | |
'rem CSV file names will default to Sheet names', | |
'rem output folder defaults to the folder where the script resides or', | |
'rem if path is specified with the input file, that path is used', | |
'rem ', | |
'rem input parameter 1: Excel path\\file in argument 1 ', | |
'rem (if path is not specified, the current path is defaulted)', | |
'rem ', | |
'rem ============================================================', | |
'', | |
'Dim strExcelFileName', | |
'Dim strCSVFileName', | |
'', | |
'strExcelFileName = WScript.Arguments.Item(0)', | |
'', | |
'rem get path where script is running', | |
'Set fso = CreateObject ("Scripting.FileSystemObject")', | |
'strScript = Wscript.ScriptFullName', | |
'strScriptPath = fso.GetAbsolutePathName(strScript & "\\..")', | |
'', | |
'rem If the Input file is NOT qualified with a path, default the current path', | |
'LPosition = InStrRev(strExcelFileName, "\\") ', | |
'if LPosition = 0 Then ', | |
' strExcelFileName = strScriptPath & "\\" & strExcelFileName', | |
'strScriptPath = strScriptPath & "\\" ', | |
'else ', | |
'strScriptPath = Mid(strExcelFileName, 1, LPosition) ', | |
'End If', | |
'rem msgbox LPosition & " - " & strExcelFileName & " - " & strScriptPath', | |
'', | |
'Set objXL = CreateObject("Excel.Application")', | |
'Set objWorkBook = objXL.Workbooks.Open(strExcelFileName)', | |
'objXL.DisplayAlerts = False', | |
'', | |
'rem loop over worksheets', | |
' For Each sheet In objWorkBook.Sheets ', | |
'if objXL.Application.WorksheetFunction.CountA(sheet.Cells) <> 0 Then ', | |
'rem sheet.Rows(1).delete', | |
'sheet.SaveAs strScriptPath & sheet.Name & ".csv", 6', | |
' End If', | |
' Next', | |
'', | |
'rem clean up ', | |
'objWorkBook.Close ', | |
'objXL.quit', | |
'Set objXL = Nothing ', | |
'Set objWorkBook = Nothing', | |
'Set fso = Nothing', | |
'', | |
'rem end script'), | |
con = converter_file) | |
close(converter_file) | |
file.copy(file_path, tempdir()) | |
orig_wd <- getwd() | |
setwd(tempdir()) | |
file.rename(file_name, paste0("filetoconvert", file_ext)) | |
shell(paste("converter.vbs", | |
paste0("filetoconvert", file_ext)), intern = TRUE) | |
setwd(orig_wd) | |
if(is.null(keep_sheets)) { | |
keep_sheets <- gsub("\\.csv", "", list.files(tempdir(), pattern = "\\.csv")) | |
} | |
file_flags <- paste0(keep_sheets, ".csv") | |
if(is.null(target_dir) | (!is.null(target_dir) & target_dir != FALSE)) { | |
for(i in 1:length(file_flags)) { | |
file.copy( | |
paste0(tempdir(), "/", file_flags[i]), file_root, overwrite = TRUE) | |
} | |
} else { | |
all_files <- lapply(file_flags, function(x) { | |
csv_file <- read.csv(paste0(tempdir(), "/", x), | |
as.is = TRUE, na.strings = c("#N/A", "NA", "N/A", "?", "")) | |
csv_file[,sapply(csv_file, function(y) mean(is.na(y), na.rm = TRUE)) < 1] | |
}) | |
if(length(all_files) == 1) { | |
all_files <- all_files[[1]] | |
} else { | |
names(all_files) <- keep_sheets | |
} | |
} | |
suppressWarnings(file.remove( | |
paste0(tempdir(), | |
"/", | |
list.files(tempdir())[!(list.files(tempdir()) %in% temp_already)]))) | |
if(!is.null(target_dir) & target_dir == FALSE) { | |
all_files | |
} | |
} |
R-bloggers.com offers daily e-mail updates about R news and tutorials about learning R and many other topics. Click here if you're looking to post or find an R/data-science job.
Want to share your content on R-bloggers? click here if you have a blog, or here if you don't.