Inconsistent residue pairs output #62
@sureshhewabi - is it still possible to use the API without paging? (i.e. set page size to unlimited) |
@colin-combe I think the SQL you wrote for this does not return unique pairs, although @aozalevsky is expecting unique residue pairs in each call. Even if we remove the pagination, it will not fix the issue. Could you please modify the SQL according to @aozalevsky's requirements? |
there is a circumstance in which the returned residue pairs will not be unique - that is, you might get ProteinA residue 1 to ProteinB residue 100 and ProteinB residue 100 to ProteinA residue 1. Is it duplicate IDs that @aozalevsky is concerned about? Duplicate IDs are deliberate, assuming that the residue pairs are unique (with the exception above). The duplicate IDs denote ambiguity in the position of the peptide and, thereby, in the position of the links. So let me know if either of these two things is not the case. It's now harder to debug because you can't switch off pagination.
this is the real problem, not duplication? |
yeah, i know that's confusing... i'm trying to say duplicate IDs should be associated with different residue pairs. |
...i'll check some of the claims i just made |
It's not an issue. I'm using the following line

```python
sxl = tuple(sorted(((eid1, rid1), (eid2, rid2))))
```

to convert all residue pairs to a unified representation so i can a) deduplicate and b) compare to residue pairs from the PDB-DEV entry. The problem is that I always get exactly 1052 residue pair records, but a) they are different every time, and b) the list is incomplete (i.e., i can't get all 611 unique residue pairs after deduplication). I thought the residue pair ID (e.g. `SII_177702_1`) would be unique. Apparently, it can be associated with multiple residue pairs (which is a bit confusing), but the real problems are the inconsistency and incompleteness. Check the example (printing all residue pairs with a given ID from 3 independent calls):

```python
for r in rps_list[0]:
    if r['id'] == 'SII_177702_1':
        print(r)
```

```
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
```

```python
for r in rps_list[2]:
    if r['id'] == 'SII_177702_1':
        print(r)
```

```
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE47_target', 'prot2_acc': 'P0CE47', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE48_target', 'prot1_acc': 'P0CE48', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
{'id': 'SII_177702_1', 'file': 'membrane_5pc_1200IDs.mzid', 'pass': True, 'prot1': 'dbseq_P0CE47_target', 'prot1_acc': 'P0CE47', 'pos1': 265, 'prot2': 'dbseq_P0CE48_target', 'prot2_acc': 'P0CE48', 'pos2': 260}
```

```python
for r in rps_list[3]:
    if r['id'] == 'SII_177702_1':
        print(r)
```

```
# empty
```
|
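For reference, the canonicalisation step described above (collapsing end-swapped residue pairs into one key) can be sketched like this. The field names follow the API records printed above; the two example records are made up for illustration:

```python
# Minimal sketch of the deduplication described above: each record is
# reduced to an order-independent key so that A->B and B->A collapse
# to a single residue pair.

def canonical_pair(r):
    # sort the two (protein, position) ends so end-swapped duplicates match
    return tuple(sorted(((r['prot1'], r['pos1']), (r['prot2'], r['pos2']))))

# two end-swapped records (hypothetical examples)
records = [
    {'prot1': 'dbseq_P0CE48_target', 'pos1': 265,
     'prot2': 'dbseq_P0CE47_target', 'pos2': 260},
    {'prot1': 'dbseq_P0CE47_target', 'pos1': 260,
     'prot2': 'dbseq_P0CE48_target', 'pos2': 265},
]

unique = {canonical_pair(r) for r in records}
print(len(unique))  # the two end-swapped records collapse to 1 pair
```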
(was downloading and parsing PXD036833 locally, hence the delay) I'll take this as an example query (this is for @sureshhewabi's info):
so that's the query from here: https://github.com/PRIDE-Archive/xiview-api/blob/python3/app/routes/pdbdev.py#L148-L161 SII_184564_1 is the match ID with the most duplicates; in xiVIEW it looks like this: The result of the SQL query is this:
so that match ID is duplicated 4 times, but each time it is a different pair of residues. I think it's correct but confusing? (The xiVIEW screenshot shows how two homologous proteins result in 4 possible links.) However, in Arthur's test code that ID came up 12 times? So there is additional duplication coming from somewhere that isn't just the SQL query? (EDIT - or rather, if it's the SQL query then it's to do with the LIMIT and OFFSET) |
@aozalevsky - i wrote the above before reading your message,
that is the id of the match (aka Spectrum Identification Item), and it can result in 4 different residue pairs.
the idea of duplicating the match ID in the output is that you can then know which links are possibly just alternative explanations of the same thing. (@grandrea) Do you think this makes sense or do we need to look at this again?
ok |
@sureshhewabi - i'll add a GROUP BY so there isn't duplication of residue pairs, basically, just like you first asked... |
@sureshhewabi - have a look at this PR - PRIDE-Archive/xiview-api#5. I quickly added GROUP BY to the PDB-Dev endpoint (i tested the SQL but not the actual API code, sorry). It entails some other changes (some things become arrays). |
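The effect of the GROUP BY change can be illustrated with an in-memory SQLite sketch (the table and column names here are hypothetical, not the actual xiview-api schema): grouping by the residue-pair columns turns the per-match rows into one row per pair, with the match IDs aggregated into a single field. Without an explicit ordering inside the aggregate, the order of those IDs is not guaranteed:

```python
import sqlite3

con = sqlite3.connect(':memory:')
# hypothetical table, one row per matched residue pair per match
con.execute('CREATE TABLE matched_residues '
            '(match_id TEXT, prot1 TEXT, pos1 INT, prot2 TEXT, pos2 INT)')
con.executemany('INSERT INTO matched_residues VALUES (?,?,?,?,?)', [
    ('SII_197232_1', 'P02931', 139, 'P02931', 42),
    ('SII_21729_1',  'P02931', 139, 'P02931', 42),
    ('SII_1335_1',   'P02931', 304, 'P02931', 227),
])

# one output row per unique residue pair; match IDs collapsed into one field
rows = con.execute(
    'SELECT prot1, pos1, prot2, pos2, group_concat(match_id) '
    'FROM matched_residues GROUP BY prot1, pos1, prot2, pos2 '
    'ORDER BY prot1, pos1, prot2, pos2'
).fetchall()
for row in rows:
    print(row)  # 2 rows; the aggregated ID order is not guaranteed
```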
@colin-combe deployed your changes. Let me know if you still want to remove pagination |
Hi @sureshhewabi, I was a bit slow realising what was going on yesterday - i sent some irrelevant messages before properly realising the structure of the data we were sending out at that time, i.e. that it had a shed load of residue-pair duplication because it was one item per match rather than one item per residue pair. Then i realised what you were asking for.
OK, thanks. The question is whether it has fixed the problem with inconsistencies. But I guess one data item per unique residue pair suits Arthur better. Also, some fields in the response might not be necessary? E.g. the 'pass' field, as it seems like consumers might only ever ask for passing matches. (In which case the API params and the code could also be simplified.) Also, the "match_ids" and "files" attributes are maybe not necessary (they are now yucky arrays of values to compensate for the GROUP BY). It depends on whether consumers are interested in following up more detail about the match. If we're keeping "match_ids", we should maybe rename it to "ident_ids" or something, because 'match' isn't mzIdentML terminology.
i don't have such a strong view on that. I feel it makes it more annoying to consume, but allowing an unlimited page size risks it timing out (at some point in the future - it should be OK now). Maybe increase the default and maximum page sizes? @aozalevsky - Arthur, do you have a view about paging? |
(if it still behaves inconsistently, i might take out the pagination just to check if it behaves as expected then) |
@sureshhewabi i'm getting 503 errors for all endpoints, including the swagger docs. also https://www.ebi.ac.uk/pride/archive/crosslinking seems to be empty. |
@colin-combe Well, as we've discussed earlier, we need to keep pagination. It makes things slightly more complicated, but I'm totally fine with or without it. Pros:
My only suggestion is to increase the page size to, let's say, 1000 or 10000 lines. Transferring up to 1MB in one request is, IMHO, totally fine with modern connection speeds. |
@aozalevsky Sorry, it was down for a few minutes. I would like to know how you are using this endpoint, because whenever it is down, it gets spotted immediately. My understanding was that you use this endpoint just to run your pipeline, or maybe periodically (as a cron job). So far my understanding was that a few minutes of downtime is not that critical, although it shouldn't happen. If it is critical, I need to implement a test environment. |
I believe we need a test environment at some point; however, as there are other priorities, we did not want to do that at this moment. Please let me know how often you use the API and how critical the downtime is for you. |
@sureshhewabi Indeed, I use it rarely to generate static reports like this (I showed an example during the call). Probably it's just a coincidence :) I'm in a different timezone (San Francisco, GMT-8) with a slightly weird schedule to accommodate occasional calls with collaborators from Europe and India, so I guess when you leave it for an "end of the day deployment," my work day just starts :) |
@aozalevsky Got it. Thanks for the clarification. Would you like to join our Slack channel, so that if you experience any issues you can communicate with us directly instead of writing here in issues? Because most of the time they are just temporary issues. |
On the other hand, just as a suggestion: if this could break your reports, maybe you can cache/store the data on your side and pull from the API periodically. |
```python
for i, (ri, rj) in enumerate(zip(rps_list[0], rps_list[1])):
    if ri != rj:
        print('#' * 80)
        print(i)
        print('-' * 40)
        print(ri)
        print('-' * 40)
        print(rj)
```

```
################################################################################
19
----------------------------------------
{'match_ids': ['SII_197232_1', 'SII_21729_1', 'SII_48226_1', 'SII_164195_1'], 'files': ['membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid'], 'pass': [True, True, True, True], 'prot1': 'dbseq_P02931_target', 'prot1_acc': 'P02931', 'pos1': 139, 'prot2': 'dbseq_P02931_target', 'prot2_acc': 'P02931', 'pos2': 42}
----------------------------------------
{'match_ids': ['SII_21729_1', 'SII_197232_1', 'SII_48226_1', 'SII_164195_1'], 'files': ['membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid'], 'pass': [True, True, True, True], 'prot1': 'dbseq_P02931_target', 'prot1_acc': 'P02931', 'pos1': 139, 'prot2': 'dbseq_P02931_target', 'prot2_acc': 'P02931', 'pos2': 42}
################################################################################
25
----------------------------------------
{'match_ids': ['SII_1335_1', 'SII_122897_1', 'SII_182749_1', 'SII_64018_1'], 'files': ['membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid'], 'pass': [True, True, True, True], 'prot1': 'dbseq_P02931_target', 'prot1_acc': 'P02931', 'pos1': 304, 'prot2': 'dbseq_P02931_target', 'prot2_acc': 'P02931', 'pos2': 227}
----------------------------------------
{'match_ids': ['SII_122897_1', 'SII_182749_1', 'SII_1335_1', 'SII_64018_1'], 'files': ['membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid', 'membrane_5pc_1200IDs.mzid'], 'pass': [True, True, True, True], 'prot1': 'dbseq_P02931_target', 'prot1_acc': 'P02931', 'pos1': 304, 'prot2': 'dbseq_P02931_target', 'prot2_acc': 'P02931', 'pos2': 227}
<...>
```
returns
This is clearly not true, because I can manually request all the other pages (up to 7) and eventually get complete data.

yep, slack is fine. can you add me using [email protected]? |
@sureshhewabi Yes, we have caching on our side and are already using it for PRIDE data. |
i think the remaining duplication is the end swapping (unimportant, as you say).
@sureshhewabi - this would make the paging unreliable, like we saw before? |
@sureshhewabi - the SQL queries need an ORDER BY clause if we have pagination? |
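For background on why the ORDER BY matters here, a minimal SQLite sketch (a hypothetical `pairs` table, not the real xiview-api schema): a total ordering over the selected columns makes LIMIT/OFFSET pages deterministic, so concatenating the pages yields the complete set exactly once.

```python
import sqlite3

con = sqlite3.connect(':memory:')
con.execute('CREATE TABLE pairs (prot1 TEXT, pos1 INT, prot2 TEXT, pos2 INT)')
con.executemany('INSERT INTO pairs VALUES (?,?,?,?)',
                [('P02931', i, 'P02931', i + 7) for i in range(25)])

def fetch_page(page_no, page_size=10):
    # The ORDER BY over all grouped columns gives a stable total order;
    # without it the database may return rows in any order, so successive
    # LIMIT/OFFSET pages can overlap or miss rows.
    return con.execute(
        'SELECT * FROM pairs ORDER BY prot1, pos1, prot2, pos2 '
        'LIMIT ? OFFSET ?', (page_size, (page_no - 1) * page_size)
    ).fetchall()

all_rows = [r for p in (1, 2, 3) for r in fetch_page(p)]
print(len(all_rows), len(set(all_rows)))  # 25 25: complete, no duplicates
```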
@colin-combe Let me check this tomorrow. |
@colin-combe @sureshhewabi any updates? the biggest issue so far is the incorrect counts:

```json
"page": {
    "page_no": 1,
    "page_size": 99,
    "total_elements": 1,
    "total_pages": 1
}
```
|
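While the reported counts are unreliable, a defensive client can avoid depending on `total_pages` by paging until an empty page comes back. A sketch, with a hypothetical `fake_fetch` standing in for the real HTTP call:

```python
def fetch_all(fetch_page, page_size=99):
    """Collect every record without trusting the reported total_pages:
    keep requesting pages until one comes back empty."""
    records, page_no = [], 1
    while True:
        page = fetch_page(page_no, page_size)
        if not page:
            break
        records.extend(page)
        page_no += 1
    return records

# stand-in for the real HTTP call, serving 653 dummy records
data = [{'id': i} for i in range(653)]
def fake_fetch(page_no, page_size):
    start = (page_no - 1) * page_size
    return data[start:start + page_size]

print(len(fetch_all(fake_fetch)))  # 653: all records recovered
```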
I'm pretty confident that anything that uses pagination based on SQL queries needs an ORDER BY clause, so that was a mistake, which should now be fixed. But I think you know about this; it was mainly @sureshhewabi that worked on the pagination. I have looked at it and don't see a problem. I think I'm stating the obvious, but i guess https://github.com/PRIDE-Archive/xiview-api/blob/python3/app/routes/pdbdev.py#L215 isn't returning what we think... Could you look into it, @sureshhewabi? There was also a suggestion about page size -
|
@colin-combe Increasing the page size to 10,000. I am deploying this first. |
Fixed the pagination issue as well. Next is to check the inconsistency |
i think the inconsistency was fixed by adding the ORDER BY |
this is fixed and can be closed? |
yes. the ordering of the data (at least for |
to clarify: the ordering of the rows must not change between calls, or the total output will potentially be incomplete and inconsistent. So when you say the ordering of the data changes, is it only the order of the IDs in the aggregated 'match_ids' array column? |
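If only the order inside the aggregated arrays varies between calls, responses can be compared after normalising those arrays. A small sketch (the `normalise` helper and example records are hypothetical, using field names from the output above):

```python
def normalise(record):
    # sort the aggregated columns so only genuine content differences remain
    r = dict(record)
    for key in ('match_ids', 'files', 'pass'):
        if isinstance(r.get(key), list):
            r[key] = sorted(r[key])
    return r

# same residue pair, aggregated IDs returned in a different order
a = {'match_ids': ['SII_197232_1', 'SII_21729_1'], 'pos1': 139, 'pos2': 42}
b = {'match_ids': ['SII_21729_1', 'SII_197232_1'], 'pos1': 139, 'pos2': 42}
print(normalise(a) == normalise(b))  # True
```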
I rechecked again. Right, the current behaviour is:
it's ok for our purposes. |
The data returned by the residue-pairs endpoint is inconsistent. While the overall size of the output is constant, the content is different every time, and thus incomplete.
Here is my test code:
And the output:
Only by combining all 10 independent calls did I get the correct number of 611 unique residue pairs, as reported by xiVIEW (without TD and DD decoys).
[screenshot: xiVIEW results]
Most surprisingly, the number of duplicated IDs is also inconsistent. Thus, duplication seems to happen on a per-line rather than per-page basis.
Let me know if you can confirm the behavior or find an error in my test.