Skip to content

Commit

Permalink
[SPARKNLP-1102] Adding support to read Excel files
Browse files Browse the repository at this point in the history
  • Loading branch information
danilojsl committed Dec 17, 2024
1 parent acc9369 commit 4657490
Show file tree
Hide file tree
Showing 9 changed files with 400 additions and 54 deletions.
70 changes: 69 additions & 1 deletion python/sparknlp/reader/sparknlp_reader.py

Large diffs are not rendered by default.

15 changes: 14 additions & 1 deletion python/test/sparknlp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,17 @@ def runTest(self):
word_df = sparknlp.read().doc(self.word_file)
word_df.show()

self.assertTrue(word_df.select("doc").count() > 0)
self.assertTrue(word_df.select("doc").count() > 0)

@pytest.mark.fast
class SparkNLPTestExcelFilesSpec(unittest.TestCase):
    """Checks that the Spark NLP reader can load an Excel workbook into a DataFrame."""

    def setUp(self):
        # Shared test data exposed by the test Spark context helper.
        self.data = SparkContextForTest.data
        # Location of the sample workbook, built relative to the current working directory.
        self.excel_file = f"file:///{os.getcwd()}/../src/test/resources/reader/xls/vodafone.xlsx"

    def runTest(self):
        """Read the sample workbook and verify the "xls" column yields at least one row."""
        reader = sparknlp.read()
        excel_df = reader.xls(self.excel_file)
        excel_df.show()

        xls_row_count = excel_df.select("xls").count()
        self.assertTrue(xls_row_count > 0)
90 changes: 90 additions & 0 deletions src/main/scala/com/johnsnowlabs/reader/ExcelReader.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package com.johnsnowlabs.reader

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import com.johnsnowlabs.reader.util.XlsxParser.{RichCell, RichRow}
import org.apache.poi.hssf.usermodel.HSSFWorkbook
import org.apache.poi.ss.usermodel.Workbook
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}

import java.io.ByteArrayInputStream
import scala.collection.JavaConverters._
import scala.collection.mutable

/** Reads Excel workbooks (legacy `.xls` and `.xlsx`) into a Spark DataFrame of parsed elements.
  *
  * @param titleFontSize
  *   font-size threshold forwarded to `RichRow.isTitle` when deciding whether a row is a title
  * @param cellSeparator
  *   string inserted between the cell values of a row when they are joined into one element
  */
class ExcelReader(titleFontSize: Int = 9, cellSeparator: String = "\t") extends Serializable {

  private val spark = ResourceHelper.spark
  import spark.implicits._

  /** Loads the file(s) at `filePath` and parses each one as an Excel workbook.
    *
    * @param filePath
    *   path checked with `ResourceHelper.validFile` and then read with `SparkContext.binaryFiles`
    * @return
    *   a DataFrame with one row per input file and the columns `path`, `content` (raw bytes) and
    *   `xls` (the elements parsed from the workbook)
    * @throws IllegalArgumentException
    *   if `filePath` does not pass `ResourceHelper.validFile`
    */
  def xls(filePath: String): DataFrame = {
    if (ResourceHelper.validFile(filePath)) {
      val binaryFilesRDD = spark.sparkContext.binaryFiles(filePath)
      // Materialize every stream as a byte array so it can be stored in a DataFrame column.
      val byteArrayRDD = binaryFilesRDD.map { case (path, portableDataStream) =>
        val byteArray = portableDataStream.toArray()
        (path, byteArray)
      }
      byteArrayRDD
        .toDF("path", "content")
        .withColumn("xls", parseExcelUDF(col("content")))
    } else throw new IllegalArgumentException(s"Invalid filePath: $filePath")
  }

  // Spark UDF wrapper that applies parseExcel to the raw bytes of each file.
  private val parseExcelUDF = udf((data: Array[Byte]) => {
    parseExcel(data)
  })

  // Constants for file type identification
  private val ZipMagicNumberFirstByte: Byte = 0x50.toByte // First byte of ZIP files
  private val ZipMagicNumberSecondByte: Byte = 0x4b.toByte // Second byte of ZIP files
  private val OleMagicNumber: Array[Byte] =
    Array(0xd0.toByte, 0xcf.toByte, 0x11.toByte, 0xe0.toByte) // OLE file header

  /** True when the content starts with the two ZIP magic bytes (the container used by .xlsx). */
  private def isXlsxFile(content: Array[Byte]): Boolean = {
    content.length > 1 &&
    content(0) == ZipMagicNumberFirstByte &&
    content(1) == ZipMagicNumberSecondByte
  }

  /** True when the content starts with the four-byte OLE header (the container used by .xls). */
  private def isXlsFile(content: Array[Byte]): Boolean = {
    content.length >= 4 && content.slice(0, 4).sameElements(OleMagicNumber)
  }

  /** Parses the bytes of one workbook into one element per non-empty row.
    *
    * The cell values of a row are joined with `cellSeparator` and trimmed; rows whose joined
    * text is empty are dropped. Every element carries the name of its sheet under the
    * `SheetName` metadata key.
    *
    * @throws IllegalArgumentException
    *   if the bytes start with neither the ZIP nor the OLE magic number
    */
  private def parseExcel(content: Array[Byte]): Seq[HTMLElement] = {
    val workbookInputStream = new ByteArrayInputStream(content)
    val workbook: Workbook =
      if (isXlsxFile(content)) new XSSFWorkbook(workbookInputStream)
      else if (isXlsFile(content)) new HSSFWorkbook(workbookInputStream)
      else throw new IllegalArgumentException("Unsupported file format: must be .xls or .xlsx")

    // try/finally guarantees the workbook is released even when a row or cell fails to parse.
    try {
      (0 until workbook.getNumberOfSheets).flatMap { sheetIndex =>
        val sheet = workbook.getSheetAt(sheetIndex)
        val sheetName = sheet.getSheetName

        sheet
          .iterator()
          .asScala
          .map { row =>
            val rowText =
              row.cellIterator().asScala.map(_.getCellValue).mkString(cellSeparator).trim
            (row, rowText)
          }
          .filter { case (_, rowText) => rowText.nonEmpty }
          .map { case (row, rowText) =>
            val elementType =
              if (row.isTitle(titleFontSize)) ElementType.TITLE else ElementType.NARRATIVE_TEXT
            HTMLElement(
              elementType = elementType,
              content = rowText,
              metadata = mutable.Map("SheetName" -> sheetName))
          }
          .toList // force evaluation while the workbook is still open
      }.toList
    } finally {
      workbook.close()
    }
  }

}
193 changes: 142 additions & 51 deletions src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions src/main/scala/com/johnsnowlabs/reader/util/XlsxParser.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.johnsnowlabs.reader.util

import org.apache.poi.ss.usermodel.{Cell, CellType, DateUtil, HorizontalAlignment, Row}

import scala.collection.JavaConverters._

/** Implicit helpers that add title detection to POI rows and string extraction to POI cells. */
object XlsxParser {

  implicit class RichRow(row: Row) {

    /** Reports whether any cell in the row looks like a title.
      *
      * A cell qualifies when its font is bold and, in addition, the cell is horizontally
      * centered, or its trimmed text is all upper case or starts with an upper-case character,
      * or its font height in points is at least `titleFontSizeThreshold`.
      */
    def isTitle(titleFontSizeThreshold: Int): Boolean = {

      def looksLikeTitle(cell: Cell): Boolean = {
        val style = cell.getCellStyle
        val cellFont = row.getSheet.getWorkbook.getFontAt(style.getFontIndexAsInt)

        val bold = cellFont.getBold
        val centered = style.getAlignment == HorizontalAlignment.CENTER

        val value = cell.getCellValue.trim
        val upperOrCapitalized =
          value.nonEmpty && (value == value.toUpperCase || value.head.isUpper)

        val largeFont = cellFont.getFontHeightInPoints >= titleFontSizeThreshold

        // Bold is mandatory; any one of the remaining signals completes the match.
        bold && (centered || upperOrCapitalized || largeFont)
      }

      row.cellIterator().asScala.exists(looksLikeTitle)
    }
  }

  implicit class RichCell(cell: Cell) {

    /** Renders the cell as text according to its cell type.
      *
      * Formula cells yield the formula itself; any type not listed below yields an empty string.
      */
    def getCellValue: String =
      cell.getCellType match {
        case CellType.STRING => cell.getStringCellValue
        case CellType.NUMERIC if DateUtil.isCellDateFormatted(cell) =>
          cell.getDateCellValue.toString
        case CellType.NUMERIC => cell.getNumericCellValue.toString
        case CellType.BOOLEAN => cell.getBooleanCellValue.toString
        case CellType.FORMULA => cell.getCellFormula
        case _ => ""
      }

  }

}
Binary file not shown.
Binary file added src/test/resources/reader/xls/vodafone.xlsx
Binary file not shown.
35 changes: 35 additions & 0 deletions src/test/scala/com/johnsnowlabs/reader/ExcelReaderTest.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.johnsnowlabs.reader

import com.johnsnowlabs.tags.FastTest
import org.apache.spark.sql.functions.col
import org.scalatest.flatspec.AnyFlatSpec

/** Smoke tests for [[ExcelReader]]: each case loads sample workbooks and checks that the
  * resulting `xls` column can be selected from a non-empty DataFrame.
  */
class ExcelReaderTest extends AnyFlatSpec {

  // Directory holding the sample workbooks used by these tests.
  val docDirectory = "src/test/resources/reader/xls"

  "ExcelReader" should "read an excel file" taggedAs FastTest in {
    val excelReader = new ExcelReader()
    val excelDf = excelReader.xls(s"$docDirectory/2023-half-year-analyses-by-segment.xlsx")
    excelDf.select("xls").show(false)

    assert(!excelDf.select(col("xls").getItem(0)).isEmpty)
  }

  "ExcelReader" should "read a directory of excel files" taggedAs FastTest in {
    val excelReader = new ExcelReader()
    val excelDf = excelReader.xls(docDirectory)
    excelDf.select("xls").show(false)

    assert(!excelDf.select(col("xls").getItem(0)).isEmpty)
  }

  "ExcelReader" should "read an excel file with custom cell separator" taggedAs FastTest in {
    // Use a separator that differs from the reader's default ("\t") so the custom path is
    // actually exercised.
    val excelReader = new ExcelReader(cellSeparator = ";")
    val excelDf = excelReader.xls(s"$docDirectory/vodafone.xlsx")
    excelDf.select("xls").show(false)

    assert(!excelDf.select(col("xls").getItem(0)).isEmpty)
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class WordReaderTest extends AnyFlatSpec {
val wordReader = new WordReader()
val wordDf = wordReader.doc(docDirectory)
wordDf.select("doc").show(false)

wordDf.printSchema()
assert(!wordDf.select(col("doc").getItem(0)).isEmpty)
}

Expand Down

0 comments on commit 4657490

Please sign in to comment.