Coverage for application/image_review/external_review/comment_processor.py: 8%
164 statements
« prev ^ index » next coverage.py v7.9.1, created at 2025-06-23 02:22 +0000
1import json
2import os
3import pandas as pd
4import numpy as np
5import requests
6import sys
8from flask import session
9from json import JSONDecodeError
11from application.util.functions import *
12from application.util.constants import TERM_RED, TERM_YELLOW, TERM_NORMAL
class CommentProcessor:
    """
    Fetches annotation information from the VARS db on HURLSTOR and Tator given a dict of comments (key = uuid).
    Merges fetched annotations with the data in the comment dict into a taxonomically sorted list of dicts
    (self.distilled_records). Comments whose annotations cannot be located are collected in
    self.missing_records; concepts with no VARS phylogeny record are tracked in self.no_match_records.
    """
    # Taxonomic ranks used both as DataFrame columns and as the primary sort order.
    _TAXON_RANKS = [
        'phylum',
        'subphylum',
        'superclass',
        'class',
        'subclass',
        'superorder',
        'order',
        'suborder',
        'infraorder',
        'superfamily',
        'family',
        'subfamily',
        'genus',
        'species',
    ]

    def __init__(
            self,
            comments: Dict,
            annosaurus_url: str,
            vars_phylogeny_url: str,
            tator_localizations_url: str,
            tator_project_id: int = 26,
    ):
        """
        :param comments: dict of comments keyed by annotation uuid
        :param annosaurus_url: base url of the Annosaurus (VARS) annotation service
        :param vars_phylogeny_url: base url of the VARS knowledge base phylogeny endpoint
        :param tator_localizations_url: base url of the Tator localizations endpoint
        :param tator_project_id: Tator project id for localization lookups (previously hardcoded to 26)
        """
        self.comments = comments
        self.annosaurus_url = annosaurus_url
        self.vars_phylogeny_url = vars_phylogeny_url
        self.tator_localizations_url = tator_localizations_url
        self.tator_project_id = tator_project_id
        self.distilled_records = []
        self.missing_records = []
        self.no_match_records = set()
        self.load_comments()

    def load_comments(self):
        """
        Process every comment: fetch its annotation from VARS or Tator, attach phylogeny,
        then sort and store the results in self.distilled_records.
        """
        formatted_comments = []
        # load the cached phylogeny so we only hit the VARS kb for unseen concepts
        try:
            with open(os.path.join('cache', 'phylogeny.json'), 'r') as f:
                phylogeny = json.load(f)
        except FileNotFoundError:
            phylogeny = {'Animalia': {}}
        print(f'Processing {len(self.comments)} comments...', end='', flush=True)
        # get all the tator localizations up front, because each tator call takes forever
        localizations = self._fetch_tator_localizations()
        for comment in self.comments:
            comment_dict = {
                'observation_uuid': comment,
                'image_url': self.comments[comment].get('image_url'),
                'video_url': self.comments[comment].get('video_url'),
                'video_sequence_name': self.comments[comment]['sequence'],
            }
            all_localizations = self.comments[comment].get('all_localizations')
            if all_localizations is None or all_localizations == '':
                # no localizations -> this is a VARS annotation
                ok, concept_name = self._process_vars_annotation(comment, comment_dict)
                if not ok:
                    continue
            elif session.get('tator_token'):
                ok, concept_name = self._process_tator_annotation(comment, comment_dict, localizations)
                if not ok:
                    continue
            else:
                # tator annotation but no token available: emit a placeholder record
                concept_name = None
                comment_dict['all_localizations'] = [{}]
            self._resolve_phylogeny(concept_name, phylogeny)
            if concept_name in phylogeny.keys():
                for key in phylogeny[concept_name].keys():
                    # split to account for worms 'Phylum (Division)' case
                    comment_dict[key.split(' ')[0]] = phylogeny[concept_name][key]
            formatted_comments.append(comment_dict)
        self._distill_records(formatted_comments)
        print('processed!')
        self._save_phylogeny(phylogeny)

    def _fetch_tator_localizations(self) -> list:
        """
        Fetch all Tator localizations for the media ids referenced by the comments.
        Returns an empty list when no Tator token is in the session.
        """
        localizations = []
        if not session.get('tator_token'):
            return localizations
        media_ids = set()
        for comment in self.comments:
            if self.comments[comment].get('all_localizations') is not None:
                # get the media id from the video url (not stored as its own field)
                media_id = self.comments[comment]['video_url'].split('/')[-1].split('&')[0]
                media_ids.add(media_id)
        media_id_list = list(media_ids)  # hoisted: original rebuilt this list every chunk
        for i in range(0, len(media_id_list), 300):  # just get all localizations for each media id
            chunk = media_id_list[i:i + 300]
            get_localization_res = requests.get(
                url=f'{self.tator_localizations_url}/{self.tator_project_id}?media_id={",".join(map(str, chunk))}',
                headers={
                    'Content-Type': 'application/json',
                    'Authorization': f'Token {session["tator_token"]}',
                })
            localizations += get_localization_res.json()
        return localizations

    def _process_vars_annotation(self, comment, comment_dict) -> tuple:
        """
        Fetch the VARS annotation for this uuid from Annosaurus and fill comment_dict.
        Returns (ok, concept_name); ok is False when the annotation cannot be found,
        in which case the comment is appended to self.missing_records.
        """
        vars_res = requests.get(url=f'{self.annosaurus_url}/annotations/{comment}')
        try:
            annotation = vars_res.json()
            concept_name = annotation['concept']
        except (JSONDecodeError, KeyError):
            problem_comment = self.comments[comment]
            print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in VARS ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
            self.missing_records.append(problem_comment)
            return False, None
        guide_photo = None
        upon = None
        identity_certainty = None
        identity_reference = None
        vars_comment = None
        for association in annotation.get('associations') or []:
            link_name = association['link_name']
            if link_name == 'identity-certainty':
                identity_certainty = association['link_value']
            elif link_name == 'identity-reference':
                identity_reference = association['link_value']
            elif link_name == 'guide-photo':
                guide_photo = association['to_concept']
            elif link_name == 'upon':
                upon = association['to_concept']
            elif link_name == 'comment':
                vars_comment = association['link_value']
        # ctd: depth comes from the annotation's ancillary data when present
        depth = (annotation.get('ancillary_data') or {}).get('depth_meters')
        comment_dict['concept'] = concept_name
        comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
        comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
        comment_dict['associations'] = annotation.get('associations')
        comment_dict['identity_reference'] = identity_reference
        # bug fix: key was 'guide-photo', which the DataFrame column filter ('guide_photo') dropped
        comment_dict['guide_photo'] = guide_photo
        comment_dict['upon'] = upon
        comment_dict['identity_certainty'] = identity_certainty
        comment_dict['depth'] = round(depth) if depth else None
        comment_dict['comment'] = vars_comment
        return True, concept_name

    def _process_tator_annotation(self, comment, comment_dict, localizations) -> tuple:
        """
        Fill comment_dict from the matching Tator localization (by elemental_id).
        Returns (ok, scientific_name); ok is False when the localization is missing
        or deleted, in which case the comment is appended to self.missing_records.
        """
        annotation = next((loco for loco in localizations if loco['elemental_id'] == comment), None)
        if annotation is None or annotation['variant_deleted']:
            problem_comment = self.comments[comment]
            if annotation is None:
                problem_comment['timestamp'] = 'No timestamp available'
            else:
                problem_comment['timestamp'] = f'Media ID: {annotation["media"]}, Frame: {annotation["frame"]}'
            print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in Tator ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
            self.missing_records.append(problem_comment)
            return False, None
        comment_dict['good_image'] = bool(annotation['attributes'].get('Good Image'))
        scientific_name = annotation['attributes'].get('Scientific Name')
        comment_dict['all_localizations'] = json.loads(self.comments[comment].get('all_localizations'))
        comment_dict['scientific_name'] = scientific_name
        comment_dict['media_id'] = annotation['media']
        comment_dict['frame'] = annotation['frame']
        comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
        comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
        if annotation.get('attributes'):
            comment_dict['attracted'] = annotation['attributes'].get('Attracted')
            comment_dict['frame_url'] = f'/tator/frame/{annotation["media"]}/{annotation["frame"]}'
            comment_dict['categorical_abundance'] = annotation['attributes'].get('Categorical Abundance')
            comment_dict['identification_remarks'] = annotation['attributes'].get('IdentificationRemarks')
            comment_dict['morphospecies'] = annotation['attributes'].get('Morphospecies')
            comment_dict['identified_by'] = annotation['attributes'].get('Identified By')
            comment_dict['notes'] = annotation['attributes'].get('Notes')
            comment_dict['qualifier'] = annotation['attributes'].get('Qualifier')
            comment_dict['reason'] = annotation['attributes'].get('Reason')
            comment_dict['tentative_id'] = annotation['attributes'].get('Tentative ID')
        return True, scientific_name

    def _resolve_phylogeny(self, concept_name, phylogeny):
        """
        Fetch the phylogeny for concept_name from the VARS knowledge base into the
        phylogeny dict, unless it is already cached or known to have no record.
        Bug fixes vs the original: uses self.no_match_records consistently (the
        original added tree-shape failures only to a dead local set, so lookups
        were retried per comment) and reports concept_name in the warning (the
        original printed annotation['concept'], a KeyError for Tator records).
        """
        if not concept_name or concept_name in phylogeny.keys() or concept_name in self.no_match_records:
            return
        with requests.get(url=f'{self.vars_phylogeny_url}/{concept_name}') as vars_tax_res:
            if vars_tax_res.status_code != 200:
                self.no_match_records.add(concept_name)
                print(f'\n{TERM_RED}Unable to find record for {concept_name}{TERM_NORMAL}')
                return
            try:
                # walk down five levels to get to phylum; IndexError added (empty 'children' lists)
                vars_tree = vars_tax_res.json()['children'][0]['children'][0]['children'][0]['children'][0]['children'][0]
                phylogeny[concept_name] = {}
            except (KeyError, IndexError):
                self.no_match_records.add(concept_name)
                print(f'{TERM_YELLOW}WARNING: Could not find phylogeny for concept "{concept_name}" in VARS knowledge base{TERM_NORMAL}')
                vars_tree = {}
            while 'children' in vars_tree.keys():
                if 'rank' in vars_tree.keys():  # sometimes it's not
                    phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']
                vars_tree = vars_tree['children'][0]
            if 'rank' in vars_tree.keys():
                phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']

    def _distill_records(self, formatted_comments):
        """
        Sort the formatted comments taxonomically via pandas and append each record's
        non-null fields to self.distilled_records.
        """
        annotation_df = pd.DataFrame(formatted_comments, columns=[
            'observation_uuid',
            'concept',
            'scientific_name',
            'associations',
            'all_localizations',
            'attracted',
            'categorical_abundance',
            'identification_remarks',
            'identified_by',
            'notes',
            'qualifier',
            'reason',
            'morphospecies',
            'tentative_id',
            'identity_certainty',
            'identity_reference',
            'guide_photo',
            'good_image',
            'media_id',
            'frame',
            'comment',
            'image_url',
            'frame_url',
            'video_url',
            'upon',
            'recorded_timestamp',
            'video_sequence_name',
            'annotator',
            'depth',
        ] + self._TAXON_RANKS)
        annotation_df = annotation_df.sort_values(by=self._TAXON_RANKS + [
            'concept',
            'identity_reference',
            'identity_certainty',
            'recorded_timestamp',
        ])
        annotation_df = annotation_df.replace({pd.NA: None, np.nan: None})
        for record in annotation_df.to_dict(orient='records'):
            self.distilled_records.append({key: value for key, value in record.items() if value is not None})

    def _save_phylogeny(self, phylogeny):
        """Persist the (possibly extended) phylogeny cache, creating the cache dir if needed."""
        os.makedirs('cache', exist_ok=True)
        with open(os.path.join('cache', 'phylogeny.json'), 'w') as f:
            json.dump(phylogeny, f, indent=2)