Coverage for application / vars / vars_qaqc_processor.py: 88%
408 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-23 05:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-23 05:22 +0000
1from application.util.functions import extract_recorded_datetime, get_association
2from application.vars.vars_annotation_processor import VarsAnnotationProcessor
5class VarsQaqcProcessor(VarsAnnotationProcessor):
6 """
7 Filters and formats annotations for the various DARC QA/QC checks.
8 """
def __init__(self, sequence_names: list, vars_charybdis_url: str, vars_kb_url: str):
    """
    Builds the processor by forwarding every argument, unchanged and in order, to VarsAnnotationProcessor.

    :param sequence_names: names of the sequences to check (presumably stored by the parent as
        self.sequence_names, which every check in this class iterates over; confirm against the parent class)
    :param vars_charybdis_url: forwarded to the parent constructor
    :param vars_kb_url: forwarded to the parent constructor
    """
    super().__init__(sequence_names, vars_charybdis_url, vars_kb_url)
def find_duplicate_associations(self):
    """
    Finds annotations that have more than one of the same association besides s2

    Localization records are skipped. An annotation is added to self.working_records (once) as soon as a link
    name other than 's2' repeats within its associations. After all sequences have been scanned, the collected
    records are passed through process_working_records() and sort_records().
    """
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            seen_link_names = set()
            for association in annotation['associations']:
                # kept in its own variable so the sequence name held by the outer loop is never overwritten
                link_name = association['link_name']
                if link_name == 's2':
                    # s2 is allowed to appear any number of times
                    continue
                if link_name in seen_link_names:
                    self.working_records.append(annotation)
                    break
                seen_link_names.add(link_name)
    self.sort_records(self.process_working_records())
def find_missing_s1(self):
    """
    Collects every annotation for which get_association() finds no 's1' link. Records whose concept is 'none'
    and records in the 'localization' group are not examined.
    """
    for sequence_name in self.sequence_names:
        annotations = self.fetch_media_and_annotations(sequence_name, images_only=False)
        self.working_records.extend(
            annotation
            for annotation in annotations
            if not (annotation['concept'] == 'none' or annotation.get('group') == 'localization')
            and not get_association(annotation, 's1')
        )
    self.sort_records(self.process_working_records())
def find_identical_s1_s2(self):
    """
    Flags annotations whose s1 concept also appears among their s2 concepts. Localization records are skipped.
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            links = annotation['associations']
            s1_concepts = [link['to_concept'] for link in links if link['link_name'] == 's1']
            s2_concepts = [link['to_concept'] for link in links if link['link_name'] == 's2']
            # the last s1 wins; an annotation without any s1 is compared using the empty string
            s1_concept = s1_concepts[-1] if s1_concepts else ''
            if s1_concept in s2_concepts:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_duplicate_s2(self):
    """
    Flags annotations in which the same concept is listed by more than one s2 association. Localization records
    are skipped.
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            s2_concepts = [
                link['to_concept'] for link in annotation['associations'] if link['link_name'] == 's2'
            ]
            # fewer distinct values than entries means at least one s2 value repeats
            if len(set(s2_concepts)) < len(s2_concepts):
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_missing_upon_substrate(self):
    """
    Finds annotations that have an upon association that is not an organism, but the 'upon' is not present in s1 or
    any s2

    An 'upon' is treated as an organism when its to_concept starts with an uppercase letter or with 'dead'.
    Localization records are skipped. A missing (None) or empty to_concept is never flagged.
    """
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            associations = annotation['associations']
            upon = None
            for association in associations:
                if association['link_name'] != 'upon':
                    continue
                # None is normalised to '' so the checks below cannot raise AttributeError
                concept = association['to_concept'] or ''
                if concept[:1].isupper() or concept.startswith('dead'):
                    # 'upon' is an organism, don't need it to be in s1/s2
                    continue
                # 'upon' should be in s1 or s2
                # NOTE(review): assumes the search stops at the first non-organism 'upon' - confirm
                upon = concept
                break
            if not upon:
                continue
            listed_as_substrate = any(
                association['link_name'] in ('s1', 's2') and association['to_concept'] == upon
                for association in associations
            )
            if not listed_as_substrate:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_mismatched_substrates(self):
    """
    Finds annotations that occur at the same timestamp (same second) but have different substrates

    For each sequence, non-localization annotations are grouped by the first 19 characters of their
    'recorded_timestamp'. Within a group, every annotation's substrates (its s1 plus the set of its s2 values) are
    compared against those of the earliest annotation; if any differ, the whole group is added to
    self.working_records once.
    """
    def substrates_of(annotation):
        # Substrates of ONE annotation; built fresh on every call so nothing carries over between records.
        substrates = {'s2': set()}
        for association in annotation['associations']:
            if association['link_name'] == 's1':
                substrates['s1'] = association['to_concept']
            elif association['link_name'] == 's2':
                substrates['s2'].add(association['to_concept'])
        return substrates

    for name in self.sequence_names:
        sorted_annotations = sorted(
            self.fetch_media_and_annotations(name, images_only=False),
            key=lambda d: d['recorded_timestamp'],
        )
        # {timestamp truncated to 19 characters: [annotations in timestamp order]}
        annotations_by_second = {}
        for annotation in sorted_annotations:
            if annotation.get('group') == 'localization':
                continue
            annotations_by_second.setdefault(annotation['recorded_timestamp'][:19], []).append(annotation)
        for same_second in annotations_by_second.values():
            if len(same_second) < 2:
                continue
            base_substrates = substrates_of(same_second[0])
            if any(substrates_of(other) != base_substrates for other in same_second[1:]):
                self.working_records.extend(same_second)
    self.sort_records(self.process_working_records())
def find_missing_upon(self):
    """
    Collects every annotation for which get_association() finds no 'upon' link. Records whose concept is 'none'
    and records in the 'localization' group are not examined.
    """
    for sequence_name in self.sequence_names:
        annotations = self.fetch_media_and_annotations(sequence_name, images_only=False)
        self.working_records.extend(
            annotation
            for annotation in annotations
            if not (annotation['concept'] == 'none' or annotation.get('group') == 'localization')
            and not get_association(annotation, 'upon')
        )
    self.sort_records(self.process_working_records())
def get_num_records_missing_ancillary_data(self):
    """
    Counts, across all sequences, the annotations that have no 'ancillary_data' key.

    :return: number of annotations without ancillary data (0 when there are none)
    """
    return sum(
        1
        for sequence_name in self.sequence_names
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False)
        if 'ancillary_data' not in annotation
    )
def find_missing_ancillary_data(self):
    """
    Collects every annotation that has no 'ancillary_data' key (can be very slow)
    """
    for sequence_name in self.sequence_names:
        annotations = self.fetch_media_and_annotations(sequence_name, images_only=False)
        self.working_records.extend(
            annotation for annotation in annotations if 'ancillary_data' not in annotation
        )
    self.sort_records(self.process_working_records())
def find_id_refs_different_concept_name(self):
    """
    Flags every annotation of an ID reference whose annotations do not all share one concept name. ID references
    are compared within a single sequence only; localization records are skipped.
    """
    for sequence_name in self.sequence_names:
        annotations_by_id_ref = {}  # {id_ref: [annotation_1, annotation_2]} in first-seen order
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            # only the first identity-reference association of an annotation is considered
            id_ref_link = next(
                (link for link in annotation['associations'] if link['link_name'] == 'identity-reference'),
                None,
            )
            if id_ref_link is not None:
                annotations_by_id_ref.setdefault(id_ref_link['link_value'], []).append(annotation)
        for members in annotations_by_id_ref.values():
            distinct_concepts = {member['concept'] for member in members}
            if len(distinct_concepts) > 1:
                self.working_records.extend(members)
    self.sort_records(self.process_working_records())
def find_id_refs_conflicting_associations(self):
    """
    Flags every annotation of an ID reference whose annotations disagree on their associations.

    Within one sequence, the first annotation seen for an ID reference provides the baseline. Each later
    annotation with the same ID reference is compared against it:
      - 's2', 'sampled-by' and 'sample-reference' may occur several times, so they are compared as sets
      - 'guide-photo' is ignored
      - any other link must carry the same value as the baseline; a link the baseline has not seen yet is added
        to it instead of being treated as a conflict
    Localization records and annotations without an identity-reference are skipped.
    """
    to_concepts = ['s1', 's2', 'upon', 'size', 'habitat', 'megahabitat', 'sampled-by']
    # links that may legitimately repeat, mapped to the association key that holds their value
    multi_valued = {'s2': 'to_concept', 'sampled-by': 'to_concept', 'sample-reference': 'link_value'}

    def single_value(link):
        # value of a link that should occur once: to_concept for the names listed above, link_value otherwise
        return link['to_concept'] if link['link_name'] in to_concepts else link['link_value']

    for sequence_name in self.sequence_names:
        baselines = {}  # {id_ref: {'flag': bool, link_name: value or set of values}}
        members = {}  # {id_ref: [annotation_1, annotation_2]}
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            id_ref = get_association(annotation, 'identity-reference')
            if not id_ref:
                continue
            current_id_ref = id_ref['link_value']

            if current_id_ref not in baselines:
                # first record for this ID reference: everything it carries becomes the baseline
                baseline = {
                    'flag': False,  # becomes True once any conflicting association is found
                    's2': set(),
                    'sampled-by': set(),
                    'sample-reference': set(),
                }
                baselines[current_id_ref] = baseline
                members[current_id_ref] = [annotation]
                for link in annotation['associations']:
                    link_name = link['link_name']
                    if link_name == 'guide-photo':
                        continue
                    if link_name in multi_valued:
                        baseline[link_name].add(link[multi_valued[link_name]])
                    else:
                        baseline[link_name] = single_value(link)
                continue

            # later record: check its association values against those saved
            baseline = baselines[current_id_ref]
            members[current_id_ref].append(annotation)
            observed = {'s2': set(), 'sampled-by': set(), 'sample-reference': set()}
            for link in annotation['associations']:
                link_name = link['link_name']
                if link_name == 'guide-photo':
                    continue
                if link_name in multi_valued:
                    observed[link_name].add(link[multi_valued[link_name]])
                    continue
                value = single_value(link)
                if link_name not in baseline:
                    # not every link is present on the first record; remember it for later comparisons
                    baseline[link_name] = value
                elif baseline[link_name] != value:
                    baseline['flag'] = True
                    break
            if any(observed[link_name] != baseline[link_name] for link_name in observed):
                baseline['flag'] = True

        for current_id_ref, baseline in baselines.items():
            if baseline['flag']:
                self.working_records.extend(members[current_id_ref])
    self.sort_records(self.process_working_records())
def find_blank_associations(self):
    """
    Finds all records that have associations with a link value of "" (and a to_concept of 'self')

    Localization records are skipped. Each matching annotation is added to self.working_records exactly once, no
    matter how many of its associations are blank.
    """
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            # any() stops at the first blank association, so the annotation cannot be appended twice
            has_blank_association = any(
                association['link_value'] == '' and association['to_concept'] == 'self'
                for association in annotation['associations']
            )
            if has_blank_association:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_suspicious_hosts(self):
    """
    Flags annotations whose 'upon' names the very concept the annotation itself records. Localization records
    are skipped.
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            host = get_association(annotation, 'upon')
            if not host:
                continue
            if host['to_concept'] == annotation['concept']:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_missing_expected_association(self):
    """
    Finds annotations that are expected to be upon another organism, but are not. This is a very slow test because
    before it can begin, we must retrieve the taxa from VARS for every record (unlike the other tests, we can't
    filter beforehand).

    A processed record is checked when the value of one of its rank fields appears in `expected_hosts` below. It is
    kept in self.final_records when its 'upon' starts with a lowercase letter and does not contain 'dead'. Records
    without an 'upon', or with an empty one, are not kept.

    If more concepts need to be added for this check, simply add them to the appropriate list below:

        Example: To add the order 'order123', change the entry below from:

            'order': ['Comatulida'],

        to:

            'order': ['Comatulida', 'order123'],

    If a rank does not exist yet, add a new entry keyed by the record field that holds that rank:

        Example: To add the subfamily 'subfam123' to the check:

            'subfamily': ['subfam123'],

    If you want the new addition to be highlighted in the table on the webpage, add the name to the ranksToHighlight
    list in vars/qaqc.js, at ~line 340
    """
    # {record field: values of that field whose records are expected to sit on another organism}
    expected_hosts = {
        'class': ['Ophiuroidea'],
        'order': ['Comatulida'],
        'infraorder': ['Anomura', 'Caridea'],
        'family': [
            'Goniasteridae', 'Poecilasmatidae', 'Parazoanthidae', 'Tubulariidae', 'Amphianthidae',
            'Actinoscyphiidae',
        ],
        'genus': ['Henricia'],
        'concept': ['Hydroidolina'],
    }
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
    # the rank fields are read from the processed records, so filtering happens only after processing
    temp_records = self.final_records
    self.final_records = []
    for record in temp_records:
        if not any(record.get(rank) in names for rank, names in expected_hosts.items()):
            continue
        upon = get_association(record, 'upon')
        if not upon:
            continue
        # None/'' is normalised to '' so indexing an empty to_concept cannot raise
        upon_concept = upon['to_concept'] or ''
        if upon_concept[:1].islower() and 'dead' not in upon_concept:
            self.final_records.append(record)
def find_long_host_associate_time_diff(self):
    """
    Finds associates whose host record is missing or was recorded long before them.

    An associate is any annotation whose 'upon' to_concept starts with an uppercase letter. For each one, the
    sequence's annotations (sorted by 'recorded_timestamp') are searched backward for the nearest record whose
    concept equals that 'upon' and whose time is not later than the associate's. The associate is added to
    self.working_records when:
      - no such host exists, or
      - the host was recorded more than 60 seconds before the associate.
    After processing, the matching entry in self.final_records gets a 'status' text describing the problem.
    """
    greater_than_one_min = {}  # {observation_uuid: whole seconds between host and associate}
    greater_than_five_mins = {}  # {observation_uuid: whole seconds between host and associate}
    not_found = []
    for name in self.sequence_names:
        sorted_annotations = sorted(
            self.fetch_media_and_annotations(name, images_only=False),
            key=lambda d: d['recorded_timestamp'],
        )
        last_index = len(sorted_annotations) - 1
        for i, associate_record in enumerate(sorted_annotations):
            upon = get_association(associate_record, 'upon')
            if not (upon and upon['to_concept'] and upon['to_concept'][0].isupper()):
                continue
            # the associate's 'upon' is an organism
            host_concept_name = upon['to_concept']
            observation_time = extract_recorded_datetime(associate_record)
            found = False
            # Checks backward, looking for the most recent host w/ matching name. We start at i + 10 (clamped to
            # the last record) because there can be multiple records with the exact same timestamp, and one of
            # those records could be the 'upon'
            for j in range(min(i + 10, last_index), -1, -1):
                host_record = sorted_annotations[j]
                # j == i: record shouldn't be associated with itself, ignore
                if j == i or host_record['concept'] != host_concept_name:
                    continue
                host_time = extract_recorded_datetime(host_record)
                if host_time > observation_time:
                    # host record won't be recorded after associate record, ignore this record
                    continue
                found = True
                # total_seconds() keeps whole days; timedelta.seconds alone would drop them
                elapsed_seconds = int((observation_time - host_time).total_seconds())
                if elapsed_seconds > 300:
                    greater_than_five_mins[associate_record['observation_uuid']] = elapsed_seconds
                    self.working_records.append(associate_record)
                elif elapsed_seconds > 60:
                    greater_than_one_min[associate_record['observation_uuid']] = elapsed_seconds
                    self.working_records.append(associate_record)
                break
            if not found:
                not_found.append(associate_record['observation_uuid'])
                self.working_records.append(associate_record)
    self.sort_records(self.process_working_records())

    # later categories overwrite earlier ones should a uuid ever land in more than one
    statuses = {}
    for uuid, elapsed_seconds in greater_than_one_min.items():
        statuses[uuid] = \
            'Time between record and closest previous matching host record greater than one minute ' \
            f'({elapsed_seconds} seconds)'
    for uuid, elapsed_seconds in greater_than_five_mins.items():
        statuses[uuid] = \
            'Time between record and closest previous matching host record greater than five minutes ' \
            f'({elapsed_seconds // 60} mins, {elapsed_seconds % 60} seconds)'
    for uuid in not_found:
        statuses[uuid] = 'Host not found in previous records'

    # only the first final record per uuid receives the status
    first_record_by_uuid = {}
    for record in self.final_records:
        first_record_by_uuid.setdefault(record['observation_uuid'], record)
    for uuid, status in statuses.items():
        record = first_record_by_uuid.get(uuid)
        if record is not None:  # a record absent from final_records is skipped instead of raising TypeError
            record['status'] = status
def find_num_bounding_boxes(self):
    """
    Tallies annotations and bounding boxes, overall and per concept, and appends one summary dict to
    self.final_records. The per-concept tallies are ordered by concept name.
    """
    per_concept = {}
    annotation_total = 0
    box_total = 0
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            annotation_total += 1
            tally = per_concept.setdefault(annotation['concept'], {'boxes': 0, 'annos': 0})
            tally['annos'] += 1
            if get_association(annotation, 'bounding box'):
                box_total += 1
                tally['boxes'] += 1
    summary = {
        'total_count_annos': annotation_total,
        'total_count_boxes': box_total,
        'bounding_box_counts': dict(sorted(per_concept.items())),
    }
    self.final_records.append(summary)
def find_localizations_without_bounding_boxes(self):
    """
    Flags records whose group and associations disagree: records in the "localization" group that have no
    'bounding box' association, and records with a 'bounding box' association that are not in that group.
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            has_box = any(link['link_name'] == 'bounding box' for link in annotation['associations'])
            is_localization = annotation.get('group') == 'localization'
            # consistent records have both properties or neither
            if has_box != is_localization:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())
def find_unique_fields(self):
    """
    Tallies the distinct values of a fixed set of annotation fields across all sequences.

    For every distinct value, two numbers are kept: 'records' (how many annotations carry it) and 'individuals'
    (the sum of those annotations' individual counts). An annotation counts as 1 individual unless a
    'population-quantity' or 'categorical-abundance' association says otherwise; when several such associations
    are present, the last applicable one wins. A field an annotation does not have is tallied under None
    (substrates: under the empty string). Ten single-key dicts, one per field, are appended to
    self.final_records.
    """
    def tally(value, counts, individuals):
        # add one record carrying `individuals` individuals to the entry for `value`
        entry = counts.get(value)
        if entry is None:
            counts[value] = {'records': 1, 'individuals': individuals}
        else:
            entry['records'] += 1
            entry['individuals'] += individuals

    # link name -> association key that holds the value of interest
    value_key_by_link = {
        'upon': 'to_concept',
        'comment': 'link_value',
        'condition-comment': 'link_value',
        'megahabitat': 'to_concept',
        'habitat': 'to_concept',
        'habitat-comment': 'link_value',
        'identity-certainty': 'link_value',
        'occurrence-remark': 'link_value',
    }
    # individual count used for each recognised abundance category
    count_by_abundance = {'11-20': 15, '21-50': 35, '51-100': 75, '\u003e100': 100}
    # report key -> tallies, in the order they are appended to final_records
    tallies = {
        'concept-names': {},
        'concept-upon-combinations': {},
        'substrate-combinations': {},
        'comments': {},
        'condition-comments': {},
        'megahabitats': {},
        'habitats': {},
        'habitat-comments': {},
        'identity-certainty': {},
        'occurrence-remarks': {},
    }

    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            found = dict.fromkeys(value_key_by_link)  # every field starts out as None
            substrates = []
            individual_count = 1

            for link in annotation['associations']:
                link_name = link['link_name']
                if link_name in ('s1', 's2'):
                    substrates.append(link['to_concept'])
                elif link_name in value_key_by_link:
                    found[link_name] = link[value_key_by_link[link_name]]
                elif link_name == 'population-quantity':
                    if link['link_value'] != '':
                        individual_count = int(link['link_value'])
                elif link_name == 'categorical-abundance':
                    # unrecognised categories leave the count untouched
                    individual_count = count_by_abundance.get(link['link_value'], individual_count)

            # sorted so the same substrates always produce the same combination text
            substrates.sort()
            concept = annotation['concept']

            tally(concept, tallies['concept-names'], individual_count)
            tally(f'{concept}:{found["upon"]}', tallies['concept-upon-combinations'], individual_count)
            tally(', '.join(substrates), tallies['substrate-combinations'], individual_count)
            tally(found['comment'], tallies['comments'], individual_count)
            tally(found['condition-comment'], tallies['condition-comments'], individual_count)
            tally(found['megahabitat'], tallies['megahabitats'], individual_count)
            tally(found['habitat'], tallies['habitats'], individual_count)
            tally(found['habitat-comment'], tallies['habitat-comments'], individual_count)
            tally(found['identity-certainty'], tallies['identity-certainty'], individual_count)
            tally(found['occurrence-remark'], tallies['occurrence-remarks'], individual_count)

    for report_key, counts in tallies.items():
        self.final_records.append({report_key: counts})