Coverage for application/vars/vars_qaqc_processor.py

1from application.util.functions import extract_recorded_datetime, get_association

2from application.vars.vars_annotation_processor import VarsAnnotationProcessor

5class VarsQaqcProcessor(VarsAnnotationProcessor):

6 """

7 Filters and formats annotations for the various DARC QA/QC checks.

8 """

def __init__(self, sequence_names: list, vars_charybdis_url: str, vars_kb_url: str):
    """
    Passes the sequence names and the two VARS URLs, unchanged, to the VarsAnnotationProcessor constructor.

    :param sequence_names: the sequences whose annotations the checks iterate over (read as
        self.sequence_names by the methods of this class)
    :param vars_charybdis_url: forwarded to the base class (presumably the Charybdis endpoint; confirm there)
    :param vars_kb_url: forwarded to the base class (presumably the knowledge-base endpoint; confirm there)
    """
    super().__init__(sequence_names, vars_charybdis_url, vars_kb_url)

def find_duplicate_associations(self):
    """
    Flags every non-localization annotation in which a link name other than s2 occurs more than once
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            seen_link_names = set()
            for association in annotation['associations']:
                link_name = association['link_name']
                if link_name in seen_link_names:
                    # repeated link name: record the annotation once and stop scanning it
                    self.working_records.append(annotation)
                    break
                if link_name != 's2':
                    # s2 is never tracked, so any number of s2 associations is accepted
                    seen_link_names.add(link_name)
    self.sort_records(self.process_working_records())

def find_missing_s1(self):
    """
    Collects annotations that have no s1 association; 'none' concepts and localizations are skipped
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            exempt = annotation['concept'] == 'none' or annotation.get('group') == 'localization'
            if not exempt and not get_association(annotation, 's1'):
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

49 def find_identical_s1_s2(self):

50 """

51 Finds annotations that have an s2 association that is the same as its s1 association

52 """

53 for name in self.sequence_names:

54 for annotation in self.fetch_media_and_annotations(name, images_only=False):

55 if annotation.get('group') == 'localization':

56 continue

57 s2s = []

58 s1 = ''

59 for association in annotation['associations']:

60 if association['link_name'] == 's1':

61 s1 = association['to_concept']

62 elif association['link_name'] == 's2':

63 s2s.append(association['to_concept'])

64 if s1 in s2s:

65 self.working_records.append(annotation)

66 self.sort_records(self.process_working_records())

def find_duplicate_s2(self):
    """
    Collects non-localization annotations in which the same s2 concept appears more than once
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            seen_s2_concepts = set()
            for association in annotation['associations']:
                if association['link_name'] != 's2':
                    continue
                if association['to_concept'] in seen_s2_concepts:
                    # repeated s2 value: record the annotation once and stop scanning it
                    self.working_records.append(annotation)
                    break
                seen_s2_concepts.add(association['to_concept'])
    self.sort_records(self.process_working_records())

def find_missing_upon_substrate(self):
    """
    Collects annotations whose 'upon' is a substrate (not capitalized, not starting with 'dead') that is
    listed in neither s1 nor any s2
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            associations = annotation['associations']
            # only the first 'upon' association of an annotation is examined
            first_upon = next((entry for entry in associations if entry['link_name'] == 'upon'), None)
            if first_upon is None:
                continue
            upon_concept = first_upon['to_concept']
            # a capitalized or 'dead...' upon is an organism, which need not appear in s1/s2
            is_organism = (upon_concept and upon_concept[0].isupper()) or upon_concept.startswith('dead')
            if is_organism or not upon_concept:
                continue
            listed_as_substrate = any(
                entry['link_name'] in ('s1', 's2') and entry['to_concept'] == upon_concept
                for entry in associations
            )
            if not listed_as_substrate:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

120

def find_mismatched_substrates(self):
    """
    Finds annotations that occur at the same timestamp (same second) but have different substrates

    For each sequence, the non-localization annotations are grouped by the first 19 characters of their
    'recorded_timestamp' (the timestamp truncated to the second). In every group of two or more, each
    annotation's substrates (its s1 concept plus the set of its s2 concepts) are compared with those of the
    earliest annotation in the group. If any annotation differs, the whole group is added to the working
    records exactly once.

    Each annotation's substrates are collected independently, so values never carry over from one compared
    annotation to the next, and the last annotations of a sequence are checked like any others.
    """
    def substrates_of(annotation):
        # 's2' is always present as a set; 's1' is only present when the annotation has one, so a missing
        # s1 compares unequal to any s1 value
        substrates = {'s2': set()}
        for association in annotation['associations']:
            if association['link_name'] == 's1':
                substrates['s1'] = association['to_concept']
            elif association['link_name'] == 's2':
                substrates['s2'].add(association['to_concept'])
        return substrates

    for name in self.sequence_names:
        sorted_annotations = sorted(
            self.fetch_media_and_annotations(name, images_only=False),
            key=lambda d: d['recorded_timestamp'],
        )
        # {timestamp to the second: [annotations in timestamp order]}
        annotations_by_second = {}
        for annotation in sorted_annotations:
            if annotation.get('group') == 'localization':
                continue
            annotations_by_second.setdefault(annotation['recorded_timestamp'][:19], []).append(annotation)
        for same_second in annotations_by_second.values():
            if len(same_second) < 2:
                continue
            base_substrates = substrates_of(same_second[0])
            if any(substrates_of(other) != base_substrates for other in same_second[1:]):
                self.working_records.extend(same_second)
    self.sort_records(self.process_working_records())

162

def find_missing_upon(self):
    """
    Collects annotations that have no upon association; 'none' concepts and localizations are skipped
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            exempt = annotation['concept'] == 'none' or annotation.get('group') == 'localization'
            if exempt:
                continue
            upon = get_association(annotation, 'upon')
            if not upon:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

174

def get_num_records_missing_ancillary_data(self):
    """
    Counts, across all sequences, the annotations that have no 'ancillary_data' key

    :return: the number of such annotations
    """
    return sum(
        1
        for sequence_name in self.sequence_names
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False)
        if 'ancillary_data' not in annotation
    )

185

def find_missing_ancillary_data(self):
    """
    Collects every annotation that has no 'ancillary_data' key (can be very slow)
    """
    for sequence_name in self.sequence_names:
        self.working_records.extend(
            annotation
            for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False)
            if 'ancillary_data' not in annotation
        )
    self.sort_records(self.process_working_records())

195

def find_id_refs_different_concept_name(self):
    """
    Collects, per sequence, all annotations whose identity-reference value is shared by more than one
    concept name
    """
    for sequence_name in self.sequence_names:
        concepts_by_id_ref = {}  # {id_ref: {concept_1, concept_2}}
        annotations_by_id_ref = {}  # {id_ref: [annotation_1, annotation_2]}
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            for association in annotation['associations']:
                if association['link_name'] != 'identity-reference':
                    continue
                id_ref = association['link_value']
                concepts_by_id_ref.setdefault(id_ref, set()).add(annotation['concept'])
                annotations_by_id_ref.setdefault(id_ref, []).append(annotation)
                break  # only the first identity-reference of an annotation is used
        for id_ref, concepts in concepts_by_id_ref.items():
            if len(concepts) > 1:
                self.working_records.extend(annotations_by_id_ref[id_ref])
    self.sort_records(self.process_working_records())

219

def find_id_refs_conflicting_associations(self):
    """
    Finds annotations with the same ID reference that have conflicting associations

    Within each sequence, the first non-localization annotation seen for an identity-reference value
    establishes the saved association values for that ID reference. Every later annotation carrying the same
    value is compared against the saved values. If any comparison fails, all annotations sharing that ID
    reference are added to the working records.
    """
    # link names whose value is read from 'to_concept'; every other link name is read from 'link_value'
    to_concepts = ['s1', 's2', 'upon', 'size', 'habitat', 'megahabitat', 'sampled-by']
    for name in self.sequence_names:
        id_ref_associations = {}  # dict of {id_ref: {ass_1_name: ass_1_val, ass_2_name: ass_2_val}}
        id_ref_annotations = {}  # dict of all annotations per id_ref: {id_ref: [annotation_1, annotation_2]}
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            id_ref = get_association(annotation, 'identity-reference')
            if id_ref:
                current_id_ref = id_ref['link_value']
                if current_id_ref not in id_ref_associations.keys():
                    # first annotation for this id_ref: it becomes the reference the others are checked against
                    id_ref_associations[current_id_ref] = {
                        'flag': False,  # we'll set this to true if we find any conflicting associations
                        's2': set(),  # s2, sampled-by, and sample-reference are allowed to have
                        'sampled-by': set(),  # more than one association
                        'sample-reference': set(),
                    }
                    id_ref_annotations[current_id_ref] = [annotation]
                    # populate id_ref dict with all associations
                    for ass in annotation['associations']:
                        if ass['link_name'] == 'guide-photo':
                            pass
                        elif ass['link_name'] == 's2' or ass['link_name'] == 'sampled-by':
                            id_ref_associations[current_id_ref][ass['link_name']].add(ass['to_concept'])
                        elif ass['link_name'] == 'sample-reference':
                            id_ref_associations[current_id_ref][ass['link_name']].add(ass['link_value'])
                        else:
                            # single-valued association: a repeated link name overwrites the earlier value
                            id_ref_associations[current_id_ref][ass['link_name']] = \
                                ass['link_value'] if ass['link_name'] not in to_concepts else ass['to_concept']
                else:
                    # check current association values vs those saved
                    id_ref_annotations[current_id_ref].append(annotation)
                    # multi-valued associations are gathered per annotation and compared as whole sets below
                    temp_s2_set = set()
                    temp_sampled_by_set = set()
                    temp_sample_ref_set = set()
                    for ass in annotation['associations']:
                        if ass['link_name'] == 'guide-photo':
                            pass
                        elif ass['link_name'] == 's2':
                            temp_s2_set.add(ass['to_concept'])
                        elif ass['link_name'] == 'sampled-by':
                            temp_sampled_by_set.add(ass['to_concept'])
                        elif ass['link_name'] == 'sample-reference':
                            temp_sample_ref_set.add(ass['link_value'])
                        else:
                            if ass['link_name'] in to_concepts:
                                if ass['link_name'] in id_ref_associations[current_id_ref].keys():
                                    # cases like 'guide-photo' will only be present on one record
                                    if id_ref_associations[current_id_ref][ass['link_name']] != ass['to_concept']:
                                        id_ref_associations[current_id_ref]['flag'] = True
                                        break
                                else:
                                    # link name not saved yet: adopt this value instead of flagging
                                    id_ref_associations[current_id_ref][ass['link_name']] = ass['to_concept']
                            else:
                                if ass['link_name'] in id_ref_associations[current_id_ref].keys():
                                    if id_ref_associations[current_id_ref][ass['link_name']] != ass['link_value']:
                                        id_ref_associations[current_id_ref]['flag'] = True
                                        break
                                else:
                                    # link name not saved yet: adopt this value instead of flagging
                                    id_ref_associations[current_id_ref][ass['link_name']] = ass['link_value']
                    # after a break above the temp sets may be incomplete, but the flag is already set
                    if temp_s2_set != id_ref_associations[current_id_ref]['s2'] \
                            or temp_sampled_by_set != id_ref_associations[current_id_ref]['sampled-by'] \
                            or temp_sample_ref_set != id_ref_associations[current_id_ref]['sample-reference']:
                        id_ref_associations[current_id_ref]['flag'] = True
        # report every annotation of each flagged id_ref in this sequence
        for id_ref in id_ref_associations.keys():
            if id_ref_associations[id_ref]['flag']:
                for annotation in id_ref_annotations[id_ref]:
                    self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

293

def find_blank_associations(self):
    """
    Finds all records that have associations with a link value of ""

    An association counts as blank when its 'link_value' is the empty string and its 'to_concept' is 'self'.
    Localizations are skipped. A matching annotation is added to the working records once, regardless of how
    many blank associations it has.
    """
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            # any() stops at the first blank association, so the annotation is never added twice
            if any(
                association['link_value'] == '' and association['to_concept'] == 'self'
                for association in annotation['associations']
            ):
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

306

def find_suspicious_hosts(self):
    """
    Collects non-localization annotations whose upon concept is the annotation's own concept
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            host = get_association(annotation, 'upon')
            if not host:
                continue
            if host['to_concept'] == annotation['concept']:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

319

def find_missing_expected_association(self):
    """
    Finds annotations that are expected to be upon another organism, but are not. This is a very slow test because
    before it can begin, we must retrieve the taxa from VARS for every record (unlike the other tests, we can't
    filter beforehand).

    A record is reported when one of its taxonomic ranks (or its concept) is in the lists below and its 'upon'
    concept is non-empty, starts with a lowercase letter, and does not contain 'dead'. Records with no 'upon',
    or with an empty 'upon' concept, are not reported.

    If more concepts need to be added for this check, simply add them to the appropriate list below:

        Example: To add the order 'order123' to the list, change the declaration below from:

        orders = ['Comatulida']

        to:

        orders = ['Comatulida', 'order123']

    If a list does not exist, declare a new list and add it to the conditional:

        Example: To add the subfamily 'subfam123' to the check, add a new list named 'subfamilies':

        subfamilies = ['subfam123']

        Then add the new list to the conditional:

        ...
        or record.get('family') in families
        or record.get('subfamily') in subfamilies   <<< ADD THIS LINE
        or record.get('genus') in genera
        ...

    If you want the new addition to be highlighted in the table on the webpage, add the name to the ranksToHighlight
    list in vars/qaqc.js, at ~line 340
    """
    classes = ['Ophiuroidea']
    orders = ['Comatulida']
    infraorders = ['Anomura', 'Caridea']
    families = ['Goniasteridae', 'Poecilasmatidae', 'Parazoanthidae', 'Tubulariidae', 'Amphianthidae', 'Actinoscyphiidae']
    genera = ['Henricia']
    concepts = ['Hydroidolina']
    for name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(name, images_only=False):
            if annotation.get('group') == 'localization':
                continue
            self.working_records.append(annotation)
    # every record has to be processed first: the rank keys used below are read from the processed records
    self.sort_records(self.process_working_records())
    temp_records = self.final_records
    self.final_records = []
    for record in temp_records:
        if record.get('class') in classes \
                or record.get('order') in orders \
                or record.get('infraorder') in infraorders \
                or record.get('family') in families \
                or record.get('genus') in genera \
                or record.get('concept') in concepts:
            upon = get_association(record, 'upon')
            # guard against an empty/None concept before indexing its first character
            upon_concept = upon['to_concept'] if upon else None
            if upon_concept and upon_concept[0].islower() and 'dead' not in upon_concept:
                self.final_records.append(record)

377

def find_long_host_associate_time_diff(self):
    """
    Finds associates whose host record is missing or was recorded long before them

    An associate is any annotation whose 'upon' concept begins with an uppercase letter (i.e. the 'upon' is an
    organism). For each associate, the annotations of the same sequence are searched backward in timestamp
    order for the closest record whose concept equals the 'upon' concept and whose time is not later than the
    associate's. The associate is added to the working records when no such host is found or when the host
    was recorded more than 60 seconds earlier. After the records are processed, a 'status' message describing
    the problem is set on the first final record with the associate's observation_uuid; if no final record
    has that uuid, nothing is set.

    The time difference is measured in whole elapsed seconds across the entire difference (days included),
    and the minutes shown in the five-minute message are total minutes.
    """
    status_by_uuid = {}  # {observation_uuid: status message for the processed record}
    for name in self.sequence_names:
        sorted_annotations = sorted(
            self.fetch_media_and_annotations(name, images_only=False),
            key=lambda d: d['recorded_timestamp'],
        )
        last_index = len(sorted_annotations) - 1
        for i, associate_record in enumerate(sorted_annotations):
            upon = get_association(associate_record, 'upon')
            if not (upon and upon['to_concept'] and upon['to_concept'][0].isupper()):
                continue
            # the associate's 'upon' is an organism
            host_concept_name = upon['to_concept']
            observation_time = extract_recorded_datetime(associate_record)
            status = 'Host not found in previous records'
            # Checks backward, looking for the most recent host w/ matching name. We start at i + 10 (capped
            # at the last record) because there can be multiple records with the exact same timestamp, and
            # one of those records could be the 'upon'
            for j in range(min(i + 10, last_index), -1, -1):
                host_record = sorted_annotations[j]
                # i == j: record shouldn't be associated with itself, ignore
                if j == i or host_record['concept'] != host_concept_name:
                    continue
                host_time = extract_recorded_datetime(host_record)
                if host_time > observation_time:
                    # host record won't be recorded after associate record, ignore this record
                    continue
                # total_seconds() covers whole days too, unlike timedelta.seconds
                elapsed = int((observation_time - host_time).total_seconds())
                if elapsed > 300:
                    minutes, seconds = divmod(elapsed, 60)
                    status = 'Time between record and closest previous matching host record greater than five minutes ' \
                        f'({minutes} mins, {seconds} seconds)'
                elif elapsed > 60:
                    status = 'Time between record and closest previous matching host record greater than one minute ' \
                        f'({elapsed} seconds)'
                else:
                    status = None  # host found close enough: nothing to report
                break
            if status is not None:
                status_by_uuid[associate_record['observation_uuid']] = status
                self.working_records.append(associate_record)
    self.sort_records(self.process_working_records())
    # attach each message to the first processed record with that uuid; pop() keeps later duplicates untouched
    for record in self.final_records:
        status = status_by_uuid.pop(record.get('observation_uuid'), None)
        if status is not None:
            record['status'] = status

434

def find_num_bounding_boxes(self):
    """
    Appends one summary dict to the final records: the overall annotation and bounding box totals plus,
    for each concept (sorted by concept), its annotation count and bounding box count.
    """
    counts_by_concept = {}
    annotation_total = 0
    box_total = 0
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            annotation_total += 1
            concept_counts = counts_by_concept.setdefault(annotation['concept'], {'boxes': 0, 'annos': 0})
            concept_counts['annos'] += 1
            if get_association(annotation, 'bounding box'):
                box_total += 1
                concept_counts['boxes'] += 1
    self.final_records.append({
        'total_count_annos': annotation_total,
        'total_count_boxes': box_total,
        'bounding_box_counts': dict(sorted(counts_by_concept.items())),
    })

460

def find_localizations_without_bounding_boxes(self):
    """
    Collects annotations whose "localization" group membership disagrees with the presence of a bounding box
    association: localizations without a box, and non-localizations with one.
    """
    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            has_box = any(
                association['link_name'] == 'bounding box' for association in annotation['associations']
            )
            is_localization = annotation.get('group') == 'localization'
            if is_localization != has_box:
                self.working_records.append(annotation)
    self.sort_records(self.process_working_records())

479

def find_unique_fields(self):
    """
    Tallies, for each distinct value of several annotation fields, how many records carry that value and how
    many individuals those records represent, then appends one single-key dict per field to the final records.

    A record counts as one individual unless a non-empty 'population-quantity' or a recognized
    'categorical-abundance' association says otherwise. Fields that are absent on a record are tallied under
    None (under the empty string for the substrate combination).
    """
    def tally(value, table, count):
        # first sighting creates the entry; later sightings accumulate into it
        entry = table.get(value)
        if entry is None:
            table[value] = {'records': 1, 'individuals': count}
        else:
            entry['records'] += 1
            entry['individuals'] += count

    # link names whose value is read from 'to_concept' / from 'link_value'
    to_concept_links = ('upon', 'megahabitat', 'habitat')
    link_value_links = ('comment', 'condition-comment', 'habitat-comment', 'identity-certainty', 'occurrence-remark')
    # individual count assigned to each recognized categorical-abundance value
    abundance_counts = {'11-20': 15, '21-50': 35, '51-100': 75, '\u003e100': 100}

    concept_names = {}
    concept_upons = {}
    substrate_combinations = {}
    comments = {}
    condition_comments = {}
    megahabitats = {}
    habitats = {}
    habitat_comments = {}
    id_certainties = {}
    occurrence_remarks = {}

    for sequence_name in self.sequence_names:
        for annotation in self.fetch_media_and_annotations(sequence_name, images_only=False):
            substrates = []
            latest = dict.fromkeys(to_concept_links + link_value_links)  # every field starts as None
            individual_count = 1

            for association in annotation['associations']:
                link_name = association['link_name']
                if link_name in ('s1', 's2'):
                    substrates.append(association['to_concept'])
                elif link_name in to_concept_links:
                    latest[link_name] = association['to_concept']
                elif link_name in link_value_links:
                    latest[link_name] = association['link_value']
                elif link_name == 'population-quantity':
                    if association['link_value'] != '':
                        individual_count = int(association['link_value'])
                elif link_name == 'categorical-abundance':
                    # unrecognized abundance values leave the current count untouched
                    individual_count = abundance_counts.get(association['link_value'], individual_count)

            substrate_combination = ', '.join(sorted(substrates))

            tally(annotation['concept'], concept_names, individual_count)
            tally(f'{annotation["concept"]}:{latest["upon"]}', concept_upons, individual_count)
            tally(substrate_combination, substrate_combinations, individual_count)
            tally(latest['comment'], comments, individual_count)
            tally(latest['condition-comment'], condition_comments, individual_count)
            tally(latest['megahabitat'], megahabitats, individual_count)
            tally(latest['habitat'], habitats, individual_count)
            tally(latest['habitat-comment'], habitat_comments, individual_count)
            tally(latest['identity-certainty'], id_certainties, individual_count)
            tally(latest['occurrence-remark'], occurrence_remarks, individual_count)

    for label, table in (
        ('concept-names', concept_names),
        ('concept-upon-combinations', concept_upons),
        ('substrate-combinations', substrate_combinations),
        ('comments', comments),
        ('condition-comments', condition_comments),
        ('megahabitats', megahabitats),
        ('habitats', habitats),
        ('habitat-comments', habitat_comments),
        ('identity-certainty', id_certainties),
        ('occurrence-remarks', occurrence_remarks),
    ):
        self.final_records.append({label: table})

Coverage for application / vars / vars_qaqc_processor.py: 88%

408 statements