Skip to content

Commit

Permalink
Improve feed content parsing (#805)
Browse files Browse the repository at this point in the history
* Enable relaxed mode when parsing feeds

* Skip items in feeds if they don't have a link, or if they have neither a title nor a description

* Use XML encoding when present while parsing the feed
  • Loading branch information
msasikanth authored Feb 22, 2025
1 parent 3ecacf3 commit 8cdc155
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ internal object AtomContentParser : ContentParser() {

val postPubDateInMillis = date?.dateStringToEpochMillis()

if (title.isNullOrBlank() && content.isNullOrBlank()) {
if (link.isNullOrBlank() || (title.isNullOrBlank() && content.isNullOrBlank())) {
return null
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import io.ktor.http.set
import io.ktor.utils.io.ByteReadChannel
import io.ktor.utils.io.core.readBytes
import korlibs.io.lang.Charset
import korlibs.io.lang.Charsets
import kotlin.coroutines.CoroutineContext
import kotlin.coroutines.EmptyCoroutineContext
import kotlinx.coroutines.runBlocking
Expand All @@ -41,7 +42,11 @@ class FeedParser(private val dispatchersProvider: DispatchersProvider) {
): FeedPayload {
return try {
withContext(dispatchersProvider.io) {
val parser = MiniXmlPullParser(source = content.toCharIterator(charset))
val parser =
MiniXmlPullParser(
source = content.toCharIterator(charset),
relaxed = true,
)

parser.nextTag()

Expand Down Expand Up @@ -120,6 +125,7 @@ private fun ByteReadChannel.toCharIterator(

private val DEFAULT_BUFFER_SIZE = 1024L

private var encodingCharset: Charset? = null
private var currentIndex = 0
private var currentBuffer = String()

Expand All @@ -128,13 +134,34 @@ private fun ByteReadChannel.toCharIterator(
if (this@toCharIterator.isClosedForRead) return false

val packet = runBlocking(context) { this@toCharIterator.readRemaining(DEFAULT_BUFFER_SIZE) }
currentBuffer = buildString { charset.decode(this, packet.readBytes()) }
val bytes = packet.readBytes()
val encodingRegex = """<?xml.*encoding=["']([^"']+)["'].*?>""".toRegex()
if (encodingCharset == null) {
val encodingContent = buildString { Charsets.UTF8.decode(this, bytes) }
encodingCharset = findEncodingCharset(encodingRegex, encodingContent, charset)
}

currentBuffer = buildString { (encodingCharset ?: charset).decode(this, bytes) }

packet.release()
currentIndex = 0
return currentBuffer.isNotEmpty()
}

/**
 * Resolves the charset named by the first capture group of [encodingRegex] within
 * [encodingContent].
 *
 * Returns [fallbackCharset] when the regex does not match, or when [Charset.forName] throws for
 * the captured name.
 */
private fun findEncodingCharset(
  encodingRegex: Regex,
  encodingContent: String,
  fallbackCharset: Charset,
) = run {
  // No match at all: nothing to resolve, use the caller-supplied charset.
  val declaredName =
    encodingRegex.find(encodingContent)?.groupValues?.get(1) ?: return@run fallbackCharset

  // Lookup failures are treated as "no usable declaration" rather than propagated.
  val resolved =
    try {
      Charset.forName(declaredName)
    } catch (e: Exception) {
      null
    }

  resolved ?: fallbackCharset
}

override fun nextChar(): Char {
if (!hasNext()) throw NoSuchElementException()
return currentBuffer[currentIndex++]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ internal object RDFContentParser : ContentParser() {

val postPubDateInMillis = date?.dateStringToEpochMillis()

if (title.isNullOrBlank() && description.isNullOrBlank()) {
if (link.isNullOrBlank() || (title.isNullOrBlank() && description.isNullOrBlank())) {
return null
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ internal object RSSContentParser : ContentParser() {

val postPubDateInMillis = date?.dateStringToEpochMillis()

if (title.isNullOrBlank() && description.isNullOrBlank()) {
if (link.isNullOrBlank() || (title.isNullOrBlank() && description.isNullOrBlank())) {
return null
}

Expand Down

0 comments on commit 8cdc155

Please sign in to comment.