Skip to content

Commit

Permalink
Merge pull request #60 from ingenerator/2.1-bug-session-invalid-user-…
Browse files Browse the repository at this point in the history
…agent

Fix errors with sessions & logging related to invalid UTF-8
  • Loading branch information
acoulton authored Sep 13, 2024
2 parents f5168a7 + 89cc3da commit 5e8369b
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 4 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
### Unreleased

* Ensure StackdriverApplicationLogger can always log even with invalid UTF8 characters anywhere in payload
* Fix session & logging bugs when incoming user-agent contains invalid UTF8 or escape characters

### v2.1.0 (2024-08-02)

* Support PHP 8.3
Expand Down
8 changes: 6 additions & 2 deletions src/Logging/StackdriverApplicationLogger.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use Ingenerator\PHPUtils\Monitoring\MetricId;
use Ingenerator\PHPUtils\Monitoring\MetricsAgent;
use Ingenerator\PHPUtils\Object\InitialisableSingletonTrait;
use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use Psr\Log\AbstractLogger;
use Psr\Log\LogLevel;
use RuntimeException;
Expand Down Expand Up @@ -383,7 +384,7 @@ protected function writeLog(array $log_entry): void
try {
$success = file_put_contents(
$this->log_destination,
json_encode($log_entry)."\n",
json_encode($log_entry, JSON_INVALID_UTF8_SUBSTITUTE)."\n",
FILE_APPEND
);

Expand Down Expand Up @@ -428,7 +429,10 @@ public function logRequest(?array $server, ?float $request_start_time = NULL): v
// requestMethod, requestUrl, remoteIp are expected to come from metadata provider as they are
// shared with application log entry context
'status' => $http_code,
'userAgent' => $this->truncate($server['HTTP_USER_AGENT'] ?? NULL, 500),
'userAgent' => $this->truncate(
StringSanitiser::ensurePrintableUtf8($server['HTTP_USER_AGENT'] ?? ''),
500
),
'latency' => $this->calculateRequestLatency($request_start_time),
],
]
Expand Down
4 changes: 3 additions & 1 deletion src/Session/MysqlSession.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@


use Ingenerator\PHPUtils\DateTime\DateString;
use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use PDO;
use SessionHandlerInterface;

Expand Down Expand Up @@ -168,7 +169,7 @@ public function create_sid(): string
'hash' => $this->calculateHash(),
'data' => '',
'now' => \date('Y-m-d H:i:s'),
'user_agent' => mb_substr($this->client_user_agent, 0, 255),
'user_agent' => StringSanitiser::ensurePrintableUtf8($this->client_user_agent, 255),
'ip' => $this->client_ip,
]
);
Expand Down Expand Up @@ -307,6 +308,7 @@ public function updateTimestamp($session_id, $session_data): bool
*/
protected function calculateHash(): string
{
// Note, intentionally using the *raw* user-agent (as reported by the client) rather than the sanitised one.
$hash = $this->client_user_agent.$this->hash_salt;

return \sha1($hash);
Expand Down
43 changes: 43 additions & 0 deletions src/StringEncoding/StringSanitiser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?php

namespace Ingenerator\PHPUtils\StringEncoding;

class StringSanitiser
{

/**
* Sanitises user input to remove invalid UTF8 character sequences and non-printing ASCII (except \n\r\t)
*
* Invalid characters are replaced with `?`. Use when you want to silently ignore errors in the input, for example
* when logging things like user-agents or other non-critical data, rather than causing encoding or database insert
* errors.
*
* @see https://stackoverflow.com/a/57871683/1062943 for original solution by clarkk, updated to allow more whitespace
*
* @param string $input
* @param int|null $max_length
*
* @return string
*/
public static function ensurePrintableUtf8(string $input, ?int $max_length = null): string
{
$previous_substitute = mb_substitute_character();
mb_substitute_character(0xfffd);
try {
$cleaned = preg_replace(
'/[^[:print:]\n\t\r]/u',
'',
mb_convert_encoding($input, 'UTF-8', 'UTF-8')
);

if ($max_length !== null) {
return mb_substr($cleaned, 0, $max_length);
}

return $cleaned;
} finally {
mb_substitute_character($previous_substitute);
}
}

}
32 changes: 31 additions & 1 deletion test/unit/Logging/StackdriverApplicationLoggerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,32 @@ public function test_it_logs_to_specified_location_as_json_line_with_expected_me
$this->assertSame($messages, $actual, 'Expect correct log messages');
}

public function test_it_handles_invalid_utf8_when_logging()
{
$subject = $this->newSubject();
$line = __LINE__ + 1;
$subject->info("This got truncated somewhere\xD0 then appended to", ['and' => "this was junk\xa3\xa9"]);
$entries = $this->assertLoggedJSONLines();
$this->assertSame(
[
[
'severity' => 'INFO',
'message' => "This got truncated somewhere� then appended to",
'@ingenType' => 'app',
'logging.googleapis.com/sourceLocation' => [
'file' => __FILE__,
'line' => $line,
'function' => __CLASS__.'->'.__FUNCTION__,
],
'custom_context' => [
'and' => 'this was junk��',
],
],
],
$entries
);
}

public function test_it_appends_logs_to_existing_content_if_file_specified()
{
$previous = vfsStream::newFile('existing-log.log')
Expand Down Expand Up @@ -568,8 +594,12 @@ public function test_its_request_logger_logs_http_response_code()
$this->assertSame(402, $entry['httpRequest']['status']);
}

#[TestWith([[], null])]
#[TestWith([[], ''])]
#[TestWith([['HTTP_USER_AGENT' => 'chrome 10'], 'chrome 10'])]
#[TestWith([
['HTTP_USER_AGENT' => "Mozilla/5.0 (compatible; Baiduspider/2.0; \xa3\xa9 and more stuff"],
'Mozilla/5.0 (compatible; Baiduspider/2.0; �� and more stuff',
])]
public function test_its_request_logger_logs_user_agent_from_global_array($server, $expect)
{
http_response_code(402);
Expand Down
122 changes: 122 additions & 0 deletions test/unit/StringEncoding/StringSanitiserTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
<?php

namespace test\unit\Ingenerator\PHPUtils\StringEncoding;

use Ingenerator\PHPUtils\StringEncoding\StringSanitiser;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;

class StringSanitiserTest extends TestCase
{

public static function provider_printable_utf8(): array
{
return [
'valid ascii' => [
'I am valid text',
],
'valid ascii with newlines and tabs' => [
"I am valid\nmultiline\ttext with whitespace",
],
'valid ascii with windows newlines' => [
"I am valid\r\nmultiline fróm Windows",
],
'valid UTF8' => [
"Hello from Denmark with æøå",
],
'ascii with BEL control characters' => [
"Ring the \x07",
"Ring the �",
],
'ascii with leading NUL control characters' => [
"\x00nully null",
"�nully null",
],
'ascii with trailing NUL control characters' => [
"nully null\x00",
"nully null�",
],
'ascii with internal NUL control characters' => [
"nully \x00 null",
"nully � null",
],
'utf8 truncated at first escape' => [
"Some partial\xD0",
"Some partial�",
],
'utf8 with invalid trailing bytes' => [
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html\xa3\xa9",
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html��",
],
'utf8 with invalid included bytes' => [
"Mozilla/5.0 (compatible; Baiduspider/2.0; \xa3\xa9 and more stuff",
"Mozilla/5.0 (compatible; Baiduspider/2.0; �� and more stuff",
],
'utf8 with invalid leading bytes' => [
"\xa3\xa9Mozilla/5.0 (compatible; Baiduspider/2.0; and more stuff",
"��Mozilla/5.0 (compatible; Baiduspider/2.0; and more stuff",
],
];
}

#[DataProvider('provider_printable_utf8')]
public function test_it_sanitises_to_printable_valid_utf8_string(string $input, ?string $expect = null)
{
$this->assertSame(
$expect ?? $input,
StringSanitiser::ensurePrintableUtf8($input)
);
}

public static function provider_printable_utf8_max_length(): array
{
return [
'with invalid input, still truncates after cleaning' => [
"Mozilla/5.0 \x08 and more \xa3\xa9 nonsense",
[
23 => "Mozilla/5.0 � and more ",
24 => "Mozilla/5.0 � and more �",
25 => "Mozilla/5.0 � and more ��",
255 => "Mozilla/5.0 � and more �� nonsense",
],
],
'with valid input, truncates as multibyte' => [
"Hello from Denmark with æøå and stuff",
[
24 => 'Hello from Denmark with ',
25 => 'Hello from Denmark with æ',
26 => 'Hello from Denmark with æø',
27 => 'Hello from Denmark with æøå',
28 => 'Hello from Denmark with æøå ',
255 => 'Hello from Denmark with æøå and stuff',
],
],
];
}

#[DataProvider('provider_printable_utf8_max_length')]
public function test_it_can_constrain_printable_utf8_to_a_max_length(string $input, array $expected)
{
$actual = [];
foreach (array_keys($expected) as $max_length) {
$actual[$max_length] = StringSanitiser::ensurePrintableUtf8($input, max_length: $max_length);
}
$this->assertSame($expected, $actual);
}

public function test_it_does_not_affect_global_substitute_character_state()
{
$old_encoding = mb_substitute_character();
mb_substitute_character(0x3013);
$input = "whatever \xa3 stuff";
$encoded_with_default = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
$this->assertSame('whatever 〓 stuff', $encoded_with_default);
try {
$this->assertSame('whatever � stuff', StringSanitiser::ensurePrintableUtf8($input));
$this->assertSame($encoded_with_default, mb_convert_encoding($input, 'UTF-8', 'UTF-8'));
} finally {
mb_substitute_character($old_encoding);
}
}

}

0 comments on commit 5e8369b

Please sign in to comment.