-
Notifications
You must be signed in to change notification settings - Fork 719
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARKNLP-1102] Adding support to read Excel files
- Loading branch information
Showing
9 changed files
with
400 additions
and
54 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package com.johnsnowlabs.reader | ||
|
||
import com.johnsnowlabs.nlp.util.io.ResourceHelper | ||
import com.johnsnowlabs.reader.util.XlsxParser.{RichCell, RichRow} | ||
import org.apache.poi.hssf.usermodel.HSSFWorkbook | ||
import org.apache.poi.ss.usermodel.Workbook | ||
import org.apache.poi.xssf.usermodel.XSSFWorkbook | ||
import org.apache.spark.sql.DataFrame | ||
import org.apache.spark.sql.functions.{col, udf} | ||
|
||
import java.io.ByteArrayInputStream | ||
import scala.collection.JavaConverters._ | ||
import scala.collection.mutable | ||
|
||
/** Reads Excel workbooks (legacy `.xls` and OOXML `.xlsx`) into a Spark DataFrame.
  *
  * Each input file becomes one row with the columns `path`, `content` (the raw bytes) and `xls`
  * (the elements parsed from the workbook, one per non-blank spreadsheet row).
  *
  * @param titleFontSize
  *   font size threshold, in points, handed to `RichRow.isTitle` when deciding whether a row is a
  *   title
  * @param cellSeparator
  *   string inserted between the cell values of a row when they are joined into one text
  */
class ExcelReader(titleFontSize: Int = 9, cellSeparator: String = "\t") extends Serializable {

  private val spark = ResourceHelper.spark
  import spark.implicits._

  // Leading bytes used to tell the two supported container formats apart.
  private val ZipMagicNumberFirstByte: Byte = 0x50.toByte // 'P' of the ZIP "PK" signature
  private val ZipMagicNumberSecondByte: Byte = 0x4b.toByte // 'K' of the ZIP "PK" signature
  private val OleMagicNumber: Array[Byte] =
    Array(0xd0.toByte, 0xcf.toByte, 0x11.toByte, 0xe0.toByte) // OLE file header

  // Defined as a val so the same UDF instance is reused by every call to `xls`.
  private val parseExcelUDF = udf((data: Array[Byte]) => parseExcel(data))

  /** Loads the Excel file(s) found at `filePath` and parses them.
    *
    * @param filePath
    *   location accepted by `ResourceHelper.validFile` and `SparkContext.binaryFiles`
    * @return
    *   a DataFrame with columns `path`, `content` and `xls`
    * @throws IllegalArgumentException
    *   if `ResourceHelper.validFile` rejects `filePath`
    */
  def xls(filePath: String): DataFrame = {
    if (!ResourceHelper.validFile(filePath))
      throw new IllegalArgumentException(s"Invalid filePath: $filePath")

    spark.sparkContext
      .binaryFiles(filePath)
      .map { case (path, portableDataStream) => (path, portableDataStream.toArray()) }
      .toDF("path", "content")
      .withColumn("xls", parseExcelUDF(col("content")))
  }

  /** True when the content starts with the two-byte ZIP signature used by `.xlsx` files. */
  private def isXlsxFile(content: Array[Byte]): Boolean =
    content.length > 1 &&
      content(0) == ZipMagicNumberFirstByte &&
      content(1) == ZipMagicNumberSecondByte

  /** True when the content starts with the four-byte OLE header used by `.xls` files. */
  private def isXlsFile(content: Array[Byte]): Boolean =
    content.length >= 4 && content.slice(0, 4).sameElements(OleMagicNumber)

  /** Parses one workbook into a sequence of elements, one per non-blank row.
    *
    * The workbook is always closed, including when reading a sheet, row or cell fails.
    *
    * @param content
    *   raw bytes of an `.xls` or `.xlsx` file
    * @throws IllegalArgumentException
    *   if the bytes start with neither the ZIP nor the OLE signature
    */
  private def parseExcel(content: Array[Byte]): Seq[HTMLElement] = {
    val workbookInputStream = new ByteArrayInputStream(content)
    val workbook: Workbook =
      if (isXlsxFile(content)) new XSSFWorkbook(workbookInputStream)
      else if (isXlsFile(content)) new HSSFWorkbook(workbookInputStream)
      else throw new IllegalArgumentException("Unsupported file format: must be .xls or .xlsx")

    try {
      // Range.flatMap is strict, so every row is fully read before `finally` closes the workbook.
      (0 until workbook.getNumberOfSheets).flatMap { sheetIndex =>
        val sheet = workbook.getSheetAt(sheetIndex)
        val sheetName = sheet.getSheetName

        sheet.iterator().asScala.toList.flatMap { row =>
          val rowText =
            row.cellIterator().asScala.map(_.getCellValue).toSeq.mkString(cellSeparator).trim

          if (rowText.isEmpty) None
          else {
            // The title heuristic is only needed for rows that actually produce an element.
            val elementType =
              if (row.isTitle(titleFontSize)) ElementType.TITLE else ElementType.NARRATIVE_TEXT
            Some(
              HTMLElement(
                elementType = elementType,
                content = rowText,
                metadata = mutable.Map("SheetName" -> sheetName)))
          }
        }
      }
    } finally {
      workbook.close()
    }
  }

}
193 changes: 142 additions & 51 deletions
193
src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala
Large diffs are not rendered by default.
Oops, something went wrong.
49 changes: 49 additions & 0 deletions
49
src/main/scala/com/johnsnowlabs/reader/util/XlsxParser.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package com.johnsnowlabs.reader.util | ||
|
||
import org.apache.poi.ss.usermodel.{Cell, CellType, DateUtil, HorizontalAlignment, Row} | ||
|
||
import scala.collection.JavaConverters._ | ||
|
||
/** Implicit extensions over Apache POI rows and cells used when reading spreadsheets. */
object XlsxParser {

  /** Adds a title-detection heuristic to a POI [[Row]]. */
  implicit class RichRow(row: Row) {

    /** Reports whether any cell of the row is styled like a title.
      *
      * A cell qualifies when its font is bold and at least one of the following holds: it is
      * horizontally centered, its trimmed text is non-empty and is either unchanged by
      * upper-casing or starts with an upper-case character, or its font size in points is at
      * least `titleFontSizeThreshold`.
      */
    def isTitle(titleFontSizeThreshold: Int): Boolean =
      row.cellIterator().asScala.exists(looksLikeTitle(_, titleFontSizeThreshold))

    /** Applies the title heuristic to a single cell. */
    private def looksLikeTitle(cell: Cell, minFontSize: Int): Boolean = {
      val style = cell.getCellStyle
      val cellFont = row.getSheet.getWorkbook.getFontAt(style.getFontIndexAsInt)
      val value = cell.getCellValue.trim

      val bold = cellFont.getBold
      val centered = style.getAlignment == HorizontalAlignment.CENTER
      val capitalized =
        value.nonEmpty && (value.toUpperCase == value || value.charAt(0).isUpper)
      val largeFont = cellFont.getFontHeightInPoints >= minFontSize

      // Bold is mandatory; any one of the remaining signals completes the match.
      bold && (centered || capitalized || largeFont)
    }
  }

  /** Adds a string rendering of the cell value to a POI [[Cell]]. */
  implicit class RichCell(cell: Cell) {

    /** Renders the cell as text.
      *
      * Strings are returned as is, numeric cells as the `toString` of their date or double value
      * (depending on whether the cell is date formatted), booleans as `toString`, formula cells
      * as the formula text, and every other cell type as the empty string.
      */
    def getCellValue: String =
      cell.getCellType match {
        case CellType.STRING => cell.getStringCellValue
        case CellType.NUMERIC if DateUtil.isCellDateFormatted(cell) =>
          cell.getDateCellValue.toString
        case CellType.NUMERIC => cell.getNumericCellValue.toString
        case CellType.BOOLEAN => cell.getBooleanCellValue.toString
        case CellType.FORMULA => cell.getCellFormula
        case _ => ""
      }

  }

}
Binary file added
BIN
+37.5 KB
src/test/resources/reader/xls/2023-half-year-analyses-by-segment.xlsx
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions
35
src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package com.johnsnowlabs.reader | ||
|
||
import com.johnsnowlabs.tags.FastTest | ||
import org.apache.spark.sql.functions.col | ||
import org.scalatest.flatspec.AnyFlatSpec | ||
|
||
/** Smoke tests for [[ExcelReader]] against the workbooks under `src/test/resources/reader/xls`. */
class ExcelReaderTest extends AnyFlatSpec {

  // Local import: only the assertion helper below needs `size`.
  import org.apache.spark.sql.functions.size

  val docDirectory = "src/test/resources/reader/xls"

  /** True when at least one input file produced a non-empty `xls` array.
    *
    * Checking only that the DataFrame has rows is not enough: every input file yields a row even
    * when nothing could be parsed from it.
    */
  private def hasParsedElements(excelDf: org.apache.spark.sql.DataFrame): Boolean =
    !excelDf.filter(size(col("xls")) > 0).isEmpty

  "ExcelReader" should "read an excel file" taggedAs FastTest in {
    val excelReader = new ExcelReader()
    val excelDf = excelReader.xls(s"$docDirectory/2023-half-year-analyses-by-segment.xlsx")
    excelDf.select("xls").show(false)

    assert(hasParsedElements(excelDf))
  }

  "ExcelReader" should "read a directory of excel files" taggedAs FastTest in {
    val excelReader = new ExcelReader()
    val excelDf = excelReader.xls(docDirectory)
    excelDf.select("xls").show(false)

    assert(hasParsedElements(excelDf))
  }

  "ExcelReader" should "read an excel file with custom cell separator" taggedAs FastTest in {
    // Use a separator that differs from the reader's default ("\t") so the option is exercised.
    val excelReader = new ExcelReader(cellSeparator = ";")
    val excelDf = excelReader.xls(s"$docDirectory/vodafone.xlsx")
    excelDf.select("xls").show(false)

    assert(hasParsedElements(excelDf))
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters