Coverage for application/image_review/external_review/comment_processor.py: 8% (164 statements)


import json
import os
import sys

import numpy as np
import pandas as pd
import requests

from flask import session
from json import JSONDecodeError
from typing import Dict  # Dict is used in the __init__ signature below

from application.util.functions import *
from application.util.constants import TERM_RED, TERM_YELLOW, TERM_NORMAL


class CommentProcessor:
    """
    Fetches annotation information from the VARS db on HURLSTOR and from Tator, given a dict of comments
    (key = observation uuid). Merges the fetched annotations with the data in the comment dict into a list
    of dicts (self.distilled_records).
    """

    def __init__(self, comments: Dict, annosaurus_url: str, vars_phylogeny_url: str, tator_localizations_url: str):
        self.comments = comments
        self.annosaurus_url = annosaurus_url
        self.vars_phylogeny_url = vars_phylogeny_url
        self.tator_localizations_url = tator_localizations_url
        self.distilled_records = []
        self.missing_records = []
        self.no_match_records = set()
        self.load_comments()

    def load_comments(self):
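        """
        Builds self.distilled_records from self.comments: prefetches Tator localizations in bulk, fetches
        VARS annotations from Annosaurus one at a time, resolves each concept's phylogeny via the VARS
        knowledge base (cached in cache/phylogeny.json), then merges and sorts the records taxonomically.
        """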

        formatted_comments = []

        try:
            with open(os.path.join('cache', 'phylogeny.json'), 'r') as f:
                phylogeny = json.load(f)
        except FileNotFoundError:
            phylogeny = {'Animalia': {}}

        print(f'Processing {len(self.comments)} comments...', end='')
        sys.stdout.flush()

        # get all the tator localizations first, because each tator call takes forever
        media_ids = set()
        localizations = []
        if session.get('tator_token'):
            for comment in self.comments:
                if 'all_localizations' in self.comments[comment].keys() and self.comments[comment]['all_localizations'] is not None:
                    # get the media id from the video url (not stored as its own field)
                    media_id = self.comments[comment]['video_url'].split('/')[-1].split('&')[0]
                    media_ids.add(media_id)
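            # assumption: fetching in chunks of 300 media ids keeps each query string at a length the Tator
            # endpoint will accept while still batching (one request per media id would be far too slow)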

            media_id_list = list(media_ids)
            for i in range(0, len(media_id_list), 300):  # just get all localizations for each media id
                chunk = media_id_list[i:i + 300]
                # fixme (?) vvvv potential bug using hardcoded "26" as project id (but probably fine) vvvv
                get_localization_res = requests.get(
                    url=f'{self.tator_localizations_url}/26?media_id={",".join(map(str, chunk))}',
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': f'Token {session["tator_token"]}',
                    })
                localizations += get_localization_res.json()

        # add formatted comments to list
        for comment in self.comments:
            concept_name = None
            comment_dict = {
                'observation_uuid': comment,
                'image_url': self.comments[comment].get('image_url'),
                'video_url': self.comments[comment].get('video_url'),
                'video_sequence_name': self.comments[comment]['sequence'],
            }

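            # comments synced from VARS have no 'all_localizations' field; comments synced from Tator do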

            if 'all_localizations' not in self.comments[comment].keys() \
                    or self.comments[comment]['all_localizations'] is None \
                    or self.comments[comment]['all_localizations'] == '':
                # vars annotation
                guide_photo = None
                upon = None
                identity_certainty = None
                identity_reference = None
                depth = None
                vars_comment = None
                vars_res = requests.get(url=f'{self.annosaurus_url}/annotations/{comment}')
                try:
                    annotation = vars_res.json()
                    concept_name = annotation['concept']
                except (JSONDecodeError, KeyError):
                    problem_comment = self.comments[comment]
                    print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in VARS ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                    self.missing_records.append(problem_comment)
                    continue
                if annotation.get('associations'):
                    for association in annotation['associations']:
                        if association['link_name'] == 'identity-certainty':
                            identity_certainty = association['link_value']
                        elif association['link_name'] == 'identity-reference':
                            identity_reference = association['link_value']
                        elif association['link_name'] == 'guide-photo':
                            guide_photo = association['to_concept']
                        elif association['link_name'] == 'upon':
                            upon = association['to_concept']
                        elif association['link_name'] == 'comment':
                            vars_comment = association['link_value']
                if annotation.get('ancillary_data'):
                    # get ctd
                    depth = annotation['ancillary_data'].get('depth_meters')
                comment_dict['concept'] = concept_name
                comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
                comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
                comment_dict['associations'] = annotation.get('associations')
                comment_dict['identity_reference'] = identity_reference
                comment_dict['guide_photo'] = guide_photo  # underscore key so it matches the dataframe column below
                comment_dict['upon'] = upon
                comment_dict['identity_certainty'] = identity_certainty
                comment_dict['depth'] = round(depth) if depth else None
                comment_dict['comment'] = vars_comment
            else:
                # tator annotation
                if session.get('tator_token'):
                    annotation = next((loco for loco in localizations if loco['elemental_id'] == comment), None)
                    if annotation is None:
                        problem_comment = self.comments[comment]
                        problem_comment['timestamp'] = 'No timestamp available'
                        print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in Tator ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                        self.missing_records.append(problem_comment)
                        continue
                    elif annotation['variant_deleted']:
                        problem_comment = self.comments[comment]
                        problem_comment['timestamp'] = f'Media ID: {annotation["media"]}, Frame: {annotation["frame"]}'
                        print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in Tator ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                        self.missing_records.append(problem_comment)
                        continue
                    comment_dict['good_image'] = bool(annotation['attributes'].get('Good Image'))
                    concept_name = annotation['attributes'].get('Scientific Name')
                    comment_dict['all_localizations'] = json.loads(self.comments[comment].get('all_localizations'))
                    comment_dict['scientific_name'] = concept_name
                    comment_dict['media_id'] = annotation['media']
                    comment_dict['frame'] = annotation['frame']
                    comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
                    comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
                    if annotation.get('attributes'):
                        comment_dict['attracted'] = annotation['attributes'].get('Attracted')
                        comment_dict['frame_url'] = f'/tator/frame/{annotation["media"]}/{annotation["frame"]}'
                        comment_dict['categorical_abundance'] = annotation['attributes'].get('Categorical Abundance')
                        comment_dict['identification_remarks'] = annotation['attributes'].get('IdentificationRemarks')
                        comment_dict['morphospecies'] = annotation['attributes'].get('Morphospecies')
                        comment_dict['identified_by'] = annotation['attributes'].get('Identified By')
                        comment_dict['notes'] = annotation['attributes'].get('Notes')
                        comment_dict['qualifier'] = annotation['attributes'].get('Qualifier')
                        comment_dict['reason'] = annotation['attributes'].get('Reason')
                        comment_dict['tentative_id'] = annotation['attributes'].get('Tentative ID')
                else:
                    annotation = {}
                    comment_dict['all_localizations'] = [{}]
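            # resolve phylogeny for any concept we haven't already cached or already failed to look up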

            if concept_name and concept_name not in phylogeny.keys() and concept_name not in self.no_match_records:
                # get the phylogeny from VARS kb
                with requests.get(url=f'{self.vars_phylogeny_url}/{concept_name}') as vars_tax_res:
                    if vars_tax_res.status_code == 200:
                        # this gets us down to phylum
                        try:
                            vars_tree = vars_tax_res.json()['children'][0]['children'][0]['children'][0]['children'][0]['children'][0]
                            phylogeny[concept_name] = {}
                        except KeyError:
                            # remember the failure so we don't re-query or re-warn for this concept
                            self.no_match_records.add(concept_name)
                            print(f'{TERM_YELLOW}WARNING: Could not find phylogeny for concept "{concept_name}" in VARS knowledge base{TERM_NORMAL}')
                            vars_tree = {}
                        while 'children' in vars_tree.keys():
                            if 'rank' in vars_tree.keys():  # sometimes it's not
                                phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']
                            vars_tree = vars_tree['children'][0]
                        if 'rank' in vars_tree.keys():
                            phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']
                    else:
                        self.no_match_records.add(concept_name)
                        print(f'\n{TERM_RED}Unable to find record for {concept_name}{TERM_NORMAL}')
            if concept_name in phylogeny.keys():
                for key in phylogeny[concept_name].keys():
                    # split to account for worms 'Phylum (Division)' case
                    comment_dict[key.split(' ')[0]] = phylogeny[concept_name][key]
            formatted_comments.append(comment_dict)

        # add to dataframe for sorting
        annotation_df = pd.DataFrame(formatted_comments, columns=[
            'observation_uuid',
            'concept',
            'scientific_name',
            'associations',
            'all_localizations',
            'attracted',
            'categorical_abundance',
            'identification_remarks',
            'identified_by',
            'notes',
            'qualifier',
            'reason',
            'morphospecies',
            'tentative_id',
            'identity_certainty',
            'identity_reference',
            'guide_photo',
            'good_image',
            'media_id',
            'frame',
            'comment',
            'image_url',
            'frame_url',
            'video_url',
            'upon',
            'recorded_timestamp',
            'video_sequence_name',
            'annotator',
            'depth',
            'phylum',
            'subphylum',
            'superclass',
            'class',
            'subclass',
            'superorder',
            'order',
            'suborder',
            'infraorder',
            'superfamily',
            'family',
            'subfamily',
            'genus',
            'species',
        ])
        annotation_df = annotation_df.sort_values(by=[
            'phylum',
            'subphylum',
            'superclass',
            'class',
            'subclass',
            'superorder',
            'order',
            'suborder',
            'infraorder',
            'superfamily',
            'family',
            'subfamily',
            'genus',
            'species',
            'concept',
            'identity_reference',
            'identity_certainty',
            'recorded_timestamp',
        ])
        annotation_df = annotation_df.replace({pd.NA: None, np.nan: None})
        temp_record_list = annotation_df.to_dict(orient='records')
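        # keep only non-null fields so each distilled record carries just the values it actually has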

        for record in temp_record_list:
            anno_dict = {key: value for key, value in record.items() if value is not None}
            self.distilled_records.append(anno_dict)
        print('processed!')

        # write the (possibly updated) phylogeny back to the cache, creating the directory if needed
        os.makedirs('cache', exist_ok=True)
        with open(os.path.join('cache', 'phylogeny.json'), 'w') as f:
            json.dump(phylogeny, f, indent=2)
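

# Minimal usage sketch (not part of the original module). All URLs and values below are illustrative
# placeholders, not real endpoints or data; the class must also run inside a Flask request context so that
# `session` (and any 'tator_token' it holds) is available, which is why this is left as a comment rather
# than an executable __main__ block.
#
#   comments = {
#       '<observation-uuid>': {
#           'sequence': 'Deep Discoverer 23060501',                  # dive/sequence name (required)
#           'timestamp': '2023-06-05T12:34:56Z',                     # used in error messages for VARS records
#           'annotator': 'J. Smith',                                 # fallback annotator name
#           'image_url': None,
#           'video_url': 'https://tator.example.org/media/123&frame=100',  # media id is parsed from this url
#           'all_localizations': '[{"points": [0.1, 0.2]}]',         # present only for Tator records
#       },
#   }
#   processor = CommentProcessor(
#       comments,
#       annosaurus_url='http://hurlstor.example.org:8082/anno/v1',
#       vars_phylogeny_url='http://hurlstor.example.org:8083/kb/v1/phylogeny/up',
#       tator_localizations_url='https://tator.example.org/rest/Localizations',
#   )
#   for record in processor.distilled_records:  # sorted taxonomically, with null fields dropped
#       print(record.get('concept') or record.get('scientific_name'))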