Improve feed content parsing (#805)

* Enable relaxed mode when parsing feeds
* Skip feed items that have no link, or that have neither a title nor a description/content
* Use the encoding declared in the XML declaration, when present, while parsing the feed
msasikanth · Feb 22, 2025 · 8cdc155 · 8cdc155
1 parent 3ecacf3
commit 8cdc155
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 5 deletions.
diff --git a/...k/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt b/...k/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt
@@ -131,7 +131,7 @@ internal object AtomContentParser : ContentParser() {
 
     val postPubDateInMillis = date?.dateStringToEpochMillis()
 
-    if (title.isNullOrBlank() && content.isNullOrBlank()) {
+    if (link.isNullOrBlank() || (title.isNullOrBlank() && content.isNullOrBlank())) {
       return null
     }
 

diff --git a/.../network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/FeedParser.kt b/.../network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/FeedParser.kt
@@ -23,6 +23,7 @@ import io.ktor.http.set
 import io.ktor.utils.io.ByteReadChannel
 import io.ktor.utils.io.core.readBytes
 import korlibs.io.lang.Charset
+import korlibs.io.lang.Charsets
 import kotlin.coroutines.CoroutineContext
 import kotlin.coroutines.EmptyCoroutineContext
 import kotlinx.coroutines.runBlocking
@@ -41,7 +42,11 @@ class FeedParser(private val dispatchersProvider: DispatchersProvider) {
   ): FeedPayload {
     return try {
       withContext(dispatchersProvider.io) {
-        val parser = MiniXmlPullParser(source = content.toCharIterator(charset))
+        val parser =
+          MiniXmlPullParser(
+            source = content.toCharIterator(charset),
+            relaxed = true,
+          )
 
         parser.nextTag()
 
@@ -120,6 +125,7 @@ private fun ByteReadChannel.toCharIterator(
 
     private val DEFAULT_BUFFER_SIZE = 1024L
 
+    private var encodingCharset: Charset? = null
     private var currentIndex = 0
     private var currentBuffer = String()
 
@@ -128,13 +134,34 @@ private fun ByteReadChannel.toCharIterator(
       if (this@toCharIterator.isClosedForRead) return false
 
       val packet = runBlocking(context) { this@toCharIterator.readRemaining(DEFAULT_BUFFER_SIZE) }
-      currentBuffer = buildString { charset.decode(this, packet.readBytes()) }
+      val bytes = packet.readBytes()
+      val encodingRegex = """<?xml.*encoding=["']([^"']+)["'].*?>""".toRegex()
+      if (encodingCharset == null) {
+        val encodingContent = buildString { Charsets.UTF8.decode(this, bytes) }
+        encodingCharset = findEncodingCharset(encodingRegex, encodingContent, charset)
+      }
+
+      currentBuffer = buildString { (encodingCharset ?: charset).decode(this, bytes) }
 
       packet.release()
       currentIndex = 0
       return currentBuffer.isNotEmpty()
     }
 
+    private fun findEncodingCharset(
+      encodingRegex: Regex,
+      encodingContent: String,
+      fallbackCharset: Charset,
+    ) =
+      (encodingRegex.find(encodingContent)?.groupValues?.get(1)?.let { encoding ->
+        try {
+          Charset.forName(encoding)
+        } catch (e: Exception) {
+          null
+        }
+      }
+        ?: fallbackCharset)
+
     override fun nextChar(): Char {
       if (!hasNext()) throw NoSuchElementException()
       return currentBuffer[currentIndex++]

diff --git a/...rk/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt b/...rk/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt
@@ -130,7 +130,7 @@ internal object RDFContentParser : ContentParser() {
 
     val postPubDateInMillis = date?.dateStringToEpochMillis()
 
-    if (title.isNullOrBlank() && description.isNullOrBlank()) {
+    if (link.isNullOrBlank() || (title.isNullOrBlank() && description.isNullOrBlank())) {
       return null
     }
 

diff --git a/...rk/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt b/...rk/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt
@@ -139,7 +139,7 @@ internal object RSSContentParser : ContentParser() {
 
     val postPubDateInMillis = date?.dateStringToEpochMillis()
 
-    if (title.isNullOrBlank() && description.isNullOrBlank()) {
+    if (link.isNullOrBlank() || (title.isNullOrBlank() && description.isNullOrBlank())) {
       return null
     }