diff --git a/example-manifests/measure-webpage.yml b/example-manifests/measure-webpage.yml index d354562..b5eb302 100644 --- a/example-manifests/measure-webpage.yml +++ b/example-manifests/measure-webpage.yml @@ -13,7 +13,7 @@ initialize: path: '@tngtech/if-webpage-plugins' config: scrollToBottom: true - url: https://www.thegreenwebfoundation.org/ + url: https://www.tngtech.com 'co2js': method: Co2js path: '@tngtech/if-webpage-plugins' diff --git a/src/__tests__/unit/lib/webpage-impact/index.test.ts b/src/__tests__/unit/lib/webpage-impact/index.test.ts index f5eadd2..5c96128 100644 --- a/src/__tests__/unit/lib/webpage-impact/index.test.ts +++ b/src/__tests__/unit/lib/webpage-impact/index.test.ts @@ -110,25 +110,26 @@ describe('lib/webpage-impact', () => { ]) )[0]; + console.log(data); expect(timestamp).toEqual(expectedtimestampISO); expect(duration).toEqual(0); expect(url).toEqual('http://localhost:3000'); expect(data['network/data/bytes']).toBeGreaterThanOrEqual(2000); expect(data['network/data/bytes']).toBeLessThanOrEqual(2200); expect( - data['network/data/resources/bytes'].document + data['network/data/resources/bytes']['Document'] ).toBeGreaterThanOrEqual(800); expect( - data['network/data/resources/bytes'].document + data['network/data/resources/bytes']['Document'] ).toBeLessThanOrEqual(850); expect( - data['network/data/resources/bytes'].fetch + data['network/data/resources/bytes']['Fetch'] ).toBeGreaterThanOrEqual(800); - expect(data['network/data/resources/bytes'].fetch).toBeLessThanOrEqual( - 850 - ); - expect(data['network/data/resources/bytes'].other).toEqual(422); - expect(data.options.dataReloadRatio).toBeGreaterThanOrEqual(0.4); + expect( + data['network/data/resources/bytes']['Fetch'] + ).toBeLessThanOrEqual(850); + expect(data['network/data/resources/bytes']['Other']).toEqual(422); + expect(data.options.dataReloadRatio).toBeGreaterThanOrEqual(0.45); expect(data.options.dataReloadRatio).toBeLessThanOrEqual(0.5); 
expect(data.options.firstVisitPercentage).toEqual( testFirstVisitPercentage diff --git a/src/lib/webpage-impact/README.md b/src/lib/webpage-impact/README.md index 2b0e6df..702f58c 100644 --- a/src/lib/webpage-impact/README.md +++ b/src/lib/webpage-impact/README.md @@ -28,8 +28,8 @@ The follwing config parameters are optional: - `network/data/bytes`: page weight in bytes - `network/data/resources/bytes`: resource weights by category in bytes -- `dataReloadRatio`: the percentage of data that is downloaded by return visitors (can be fed into the CO2.JS plugin) - if `options.dataReloadRatio` is already provided in input, the plugin won't calculate it +- `dataReloadRatio`: an estimate of the percentage of data that is downloaded by return visitors (can be fed into the CO2.JS plugin) + if `options.dataReloadRatio` is already provided as an input, the plugin won't calculate it - `timestamp`: set to the time of the plugin execution - `duration`: set to 0 (because the request time does not seem of particular interest here to the author) @@ -45,8 +45,10 @@ The page weight (the number of bytes transferred to load the page) can be feed i Several config options are provided to modify the loading of the page, e.g. emulating a mobile device and network conditions. By scrolling to the bottom of the page one can also take into account lazy loaded resources. Custom accept and accept-encoding request headers can also be provided. -The plugin can also approximate the `dataReloadRatio` that is needed for carbon emissions estimation with the Sustainable Webdesign Model (provided by the co2js plugin). To approximate the `dataReloadRatio` the page weight is calculated for a first visit and a return visit. The difference `weight of initial load - weight of reload` plus the weight of the resources that were loaded from browser cache on reload, is assumed to be the weight of resources that did not need reloading. -This assumption can be off. 
For example if there is a lot of dynamic content on the page, that is requested only under certain conditions or at specific times. Also, cached resources provided by service workers are not taken into account. Possibly, content personalization can also distort the measurement if initial load and reload do not get comparable content. +Please note: The reported page weight may be smaller than what you expect. For example, a web page might load additional resources after the cookie banner has been closed by the user. The plugin does not interact with the page, except for the option to scroll to the bottom of the page. + +The plugin can also approximate the `dataReloadRatio` that is needed for carbon emissions estimation with the Sustainable Webdesign Model (provided by the co2js plugin). To approximate the `dataReloadRatio` the page weight is calculated for a first visit and a return visit. The combined weight of all resources that were served from cache on reload is subtracted from the page weight, and the remainder is divided by the page weight to get the `dataReloadRatio`. +This assumption can be off. For example, if there is a lot of dynamic content on the page, that is requested only under certain conditions or at specific times. Possibly, content personalization can also distort the measurement if initial load and reload do not get comparable content. Also, prefetched requests might be served from cache and are counted as reloaded in this case, even though no data was reused. 
Further remarks: diff --git a/src/lib/webpage-impact/index.ts b/src/lib/webpage-impact/index.ts index ab5d63b..fc733b6 100644 --- a/src/lib/webpage-impact/index.ts +++ b/src/lib/webpage-impact/index.ts @@ -1,17 +1,12 @@ // SPDX-FileCopyrightText: 2024 Alexander zur Bonsen // SPDX SPDX-License-Identifier: Apache-2.0 -// for parts marked as originating from Lighthouse: -// SPDX-FileCopyrightText: 2016 Google LLC -// SPDX-License-Identifier: Apache-2.0 - import puppeteer, { HTTPRequest, - HTTPResponse, KnownDevices, Page, PredefinedNetworkConditions, - ResourceType, + Protocol, } from 'puppeteer'; import {z} from 'zod'; @@ -33,10 +28,8 @@ type WebpageImpactOptions = { type ResourceBase = { url: string; - resourceSize: number; - type: ResourceType; - fromCache: boolean; - fromServiceWorker: boolean; + status: number; + type: Protocol.Network.ResourceType; }; type Resource = ResourceBase & {transferSize: number}; @@ -45,17 +38,6 @@ type Device = keyof typeof KnownDevices; const LOGGER_PREFIX = 'WebpageImpact'; -// copied from lighthouse https://github.com/GoogleChrome/lighthouse/blob/main/core/lib/url-utils.js#L21 -// because it is not exported there -const NON_NETWORK_SCHEMES = [ - 'blob', // @see https://developer.mozilla.org/en-US/docs/Web/API/URL/createObjectURL - 'data', // @see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs - 'intent', // @see https://developer.chrome.com/docs/multidevice/android/intents/ - 'file', // @see https://en.wikipedia.org/wiki/File_URI_scheme - 'filesystem', // @see https://developer.mozilla.org/en-US/docs/Web/API/FileSystem - 'chrome-extension', -]; - const ALLOWED_ENCODINGS = [ 'gzip', 'compress', @@ -118,8 +100,14 @@ export const WebpageImpact = PluginFactory({ inputs.map(async input => { const startTime = Date.now(); + const computeReloadRatio = !input?.options?.dataReloadRatio; + const {pageWeight, resourceTypeWeights, dataReloadRatio} = - await measurePageImpactMetrics(config.url, config); + await 
measurePageImpactMetrics( + config.url, + computeReloadRatio, + config + ); const durationInSeconds = (Date.now() - startTime) / 1000; @@ -130,7 +118,7 @@ export const WebpageImpact = PluginFactory({ url: config.url, 'network/data/bytes': pageWeight, 'network/data/resources/bytes': resourceTypeWeights, - ...(config.options || dataReloadRatio // TODO not sure it is necessary to copy input.options here in every case instead of referencing them + ...(dataReloadRatio ? { options: { ...input.options, @@ -147,10 +135,9 @@ export const WebpageImpact = PluginFactory({ const WebpageImpactUtils = () => { const measurePageImpactMetrics = async ( url: string, + computeReloadRatio: boolean, config?: ConfigParams ) => { - const computeReloadRatio = !config?.options?.dataReloadRatio; - const requestHandler = (interceptedRequest: HTTPRequest) => { const headers = Object.assign({}, interceptedRequest.headers(), { ...(config?.headers?.accept && { @@ -195,18 +182,17 @@ const WebpageImpactUtils = () => { scrollToBottom: config?.scrollToBottom, }); - const reloadedResources = await loadPageResources(page, url, { - reload: true, - cacheEnabled: true, - scrollToBottom: config?.scrollToBottom, - }); + let reloadedResources: Resource[] | undefined; + if (computeReloadRatio) { + reloadedResources = await loadPageResources(page, url, { + reload: true, + cacheEnabled: true, + scrollToBottom: config?.scrollToBottom, + }); + } return { - ...computeMetrics( - initialResources, - reloadedResources, - computeReloadRatio - ), + ...computeMetrics(initialResources, reloadedResources), }; } finally { await browser.close(); @@ -222,50 +208,53 @@ const WebpageImpactUtils = () => { page: Page, url: string, {reload, cacheEnabled, scrollToBottom}: WebpageImpactOptions - ) => { - const pageResources: ResourceBase[] = []; - - const responseHandler = async (response: HTTPResponse) => { - try { - if (isFromNonNetworkRequest(response) || hasNoResponseBody(response)) { - return; - } - const resource = { - url: 
response.url(), - resourceSize: (await response.buffer()).length, - fromCache: response.fromCache(), - fromServiceWorker: response.fromServiceWorker(), - type: response.request().resourceType(), - }; - pageResources.push(resource); - } catch (error) { - console.debug( - `${LOGGER_PREFIX}: Couldn't load ${response.url()}, status: ${response.status()}: ${error}` - ); - } - }; - + ): Promise => { try { await page.setCacheEnabled(cacheEnabled); - // the transfer size of a resource is not available from puppeteer's reponse object - // need to take the detour via a Chrome devtools protcol session to read it out - const cdpIntermediateStore = new Map(); - const cdpResponses = new Map(); + // The transfer size of a resource is not available from puppeteer's response object. + // Need to take the detour via a Chrome DevTools Protocol session to get it. + // https://chromedevtools.github.io/devtools-protocol/tot/Network/ + const cdpResponses: Record = {}; + const cdpTransferSizes: Record = {}; const cdpSession = await page.createCDPSession(); await cdpSession.send('Network.enable'); cdpSession.on('Network.responseReceived', event => { - cdpIntermediateStore.set(event.requestId, {url: event.response.url}); + cdpResponses[event.requestId] = { + url: event.response.url, + status: event.response.status, + type: event.type, + }; }); + // Transfer size + // 1) Response served from web + // Network.responseReceived event only contains the number of bytes received for + // the request so far / when the initial response is received. + // The final number is sent with Network.loadingFinished. + // + // 2) Response served from cache + // If the resource is served from cache, Network.responseReceived contains the + // size of the cached response, while Network.loadingFinished reports a size of 0. 
cdpSession.on('Network.loadingFinished', event => { - const response = cdpIntermediateStore.get(event.requestId); - response && - cdpResponses.set(response.url, { - encodedDataLength: event.encodedDataLength, - }); + cdpTransferSizes[event.requestId] = { + transferSize: event.encodedDataLength, + }; }); - page.on('response', responseHandler); + // TODO: Currently, the amount of cached resources is determined by + // relying on `encodedDataLength` of the `Network.loadingFinished` event. + // It is 0 if the response was served from cache, which corresponds to + // `Network.requestServedFromCache` being true. + // Potentially this can be improved to exclude prefetched responses. + // + // Further notes: + // I haven't found good documentation about this event yet, but I assume it also includes prefetch cache + // which I would want to exclude ideally, because it does not reuse data. + // Network.responseReceived event contains two values, + // fromDiskCache and fromPrefetchCache, that allow deriving + // whether an item was served from cache. But that misses memory cache. + // (There is also fromServiceWorker, but I don't think that allows a conclusion about caching, + // it depends on what the service worker does.) 
if (!reload) { await page.goto(url, {waitUntil: 'networkidle0'}); @@ -279,10 +268,9 @@ const WebpageImpactUtils = () => { // await page.screenshot({path: './BOTTOM.png'}); } - page.off('response', responseHandler); await cdpSession.detach(); - return mergeCdpResponsesIntoResources(pageResources, cdpResponses); + return mergeCdpData(cdpResponses, cdpTransferSizes); } catch (error) { throw new Error( `${LOGGER_PREFIX}: Error while loading webpage: ${error}` @@ -290,41 +278,24 @@ const WebpageImpactUtils = () => { } }; - // modified from lighthouse https://github.com/GoogleChrome/lighthouse/blob/main/core/lib/url-utils.js - const isFromNonNetworkRequest = (response: HTTPResponse) => { - const url = response.request().url(); - return NON_NETWORK_SCHEMES.some(scheme => url.startsWith(`${scheme}:`)); - }; - - const hasNoResponseBody = (response: HTTPResponse) => { - return ( - response.status() === 204 || // no content - (response.status() >= 300 && response.status() < 400) || // redirect - response.request().method() === 'OPTIONS' // request for options https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS - ); - }; - - const mergeCdpResponsesIntoResources = ( - pageResources: ResourceBase[], - cdpResponses: Map - ) => { - return pageResources.map(resource => { - const cdpResponse = cdpResponses.get(resource.url); - if (!cdpResponse) { + const mergeCdpData = ( + cdpResponses: Record, + cdpTransferSizes: Record + ): Resource[] => { + const pageResources: Resource[] = []; + for (const [requestId, response] of Object.entries(cdpResponses)) { + const transferSize = cdpTransferSizes[requestId]?.transferSize; + if (transferSize === undefined) { console.debug( - `${LOGGER_PREFIX}: No encoded data length for resource: ${resource.url}` + `${LOGGER_PREFIX}: No transfer size found for resource ${response.url}, status: ${response.status}` ); } - return cdpResponse - ? 
({ - ...resource, - transferSize: cdpResponse.encodedDataLength, - } as Resource) - : ({ - ...resource, - transferSize: 0, - } as Resource); - }); + pageResources.push({ + ...response, + transferSize: transferSize ?? 0, + }); + } + return pageResources; }; const scrollToBottomOfPage = async () => { @@ -346,8 +317,7 @@ const WebpageImpactUtils = () => { const computeMetrics = ( initialResources: Resource[], - reloadResources: Resource[], - computeReloadRatio: boolean + reloadResources: Resource[] | undefined ) => { const resourceTypeWeights = initialResources.reduce( (acc, resource) => { @@ -358,40 +328,30 @@ const WebpageImpactUtils = () => { } return acc; }, - {} as Record + {} as Record ); const initialPageWeight = Object.values(resourceTypeWeights).reduce( (acc, resourceTypeSize) => acc + resourceTypeSize, 0 ); + // dataReloadRatio: this is an attempt to get a heuristic value + + // Caveats: + // 1) the older pre-fetch syntax () stored + // responses in disk cache as well (https://developer.chrome.com/docs/devtools/application/debugging-speculation-rules) + // 2) dynamic content loading, see README let dataReloadRatio: number | undefined; - if (computeReloadRatio) { - const initialCacheWeight = initialResources.reduce( - (acc, resource) => - acc + (resource.fromCache ? resource.transferSize : 0), - 0 - ); - if (initialCacheWeight > 0) { - console.warn( - `${LOGGER_PREFIX}: Initial page load contained resources from cache.` - ); - } + if (reloadResources !== undefined) { const reloadPageWeight = reloadResources.reduce( (acc, resource) => acc + resource.transferSize, 0 ); - const assumeFromCache = initialPageWeight - reloadPageWeight; - const browserCache = reloadResources.reduce( - (acc, resource) => - acc + (resource.fromCache ? 
resource.transferSize : 0), - 0 - ); - const assumedCacheWeight = assumeFromCache + browserCache; + const fromCache = initialPageWeight - reloadPageWeight; dataReloadRatio = roundToDecimalPlaces( - (initialPageWeight - assumedCacheWeight) / initialPageWeight, + (initialPageWeight - fromCache) / initialPageWeight, 2 ); } @@ -432,7 +392,6 @@ const WebpageImpactUtils = () => { dataReloadRatio: z.number().optional(), }) .optional(), - lighthouse: z.boolean().optional(), }); const configSchema = z