Coverage for application/image_review/external_review/comment_processor.py: 8% (164 statements)


import json
import os
import sys

import numpy as np
import pandas as pd
import requests

from flask import session
from json import JSONDecodeError
from typing import Dict  # Dict is used in the __init__ signature below

from application.util.functions import *
from application.util.constants import TERM_RED, TERM_YELLOW, TERM_NORMAL


class CommentProcessor:
    """
    Fetches annotation information from the VARS db on HURLSTOR and from Tator, given a dict of comments
    (key = observation uuid). Merges the fetched annotations with the data in the comment dict into a list
    of dicts (self.distilled_records).
    """

    def __init__(self, comments: Dict, annosaurus_url: str, vars_phylogeny_url: str, tator_localizations_url: str):
        self.comments = comments
        self.annosaurus_url = annosaurus_url
        self.vars_phylogeny_url = vars_phylogeny_url
        self.tator_localizations_url = tator_localizations_url
        self.distilled_records = []
        self.missing_records = []
        self.no_match_records = set()
        self.load_comments()

    def load_comments(self):
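        """
        Builds self.distilled_records from self.comments: prefetches Tator localizations in bulk, fetches
        VARS annotations from Annosaurus one at a time, resolves each concept's phylogeny via the VARS
        knowledge base (cached in cache/phylogeny.json), then merges and sorts the records taxonomically.
        """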

        formatted_comments = []

        try:
            with open(os.path.join('cache', 'phylogeny.json'), 'r') as f:
                phylogeny = json.load(f)
        except FileNotFoundError:
            phylogeny = {'Animalia': {}}

        print(f'Processing {len(self.comments)} comments...', end='')
        sys.stdout.flush()

        # get all the tator localizations first, because each tator call takes forever
        media_ids = set()
        localizations = []
        if session.get('tator_token'):
            for comment in self.comments:
                if 'all_localizations' in self.comments[comment].keys() and self.comments[comment]['all_localizations'] is not None:
                    # get the media id from the video url (not stored as its own field)
                    media_id = self.comments[comment]['video_url'].split('/')[-1].split('&')[0]
                    media_ids.add(media_id)
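            # assumption: fetching in chunks of 300 media ids keeps each query string at a length the Tator
            # endpoint will accept while still batching (one request per media id would be far too slow)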

            media_id_list = list(media_ids)
            for i in range(0, len(media_id_list), 300):  # just get all localizations for each media id
                chunk = media_id_list[i:i + 300]
                # fixme (?) vvvv potential bug using hardcoded "26" as project id (but probably fine) vvvv
                get_localization_res = requests.get(
                    url=f'{self.tator_localizations_url}/26?media_id={",".join(map(str, chunk))}',
                    headers={
                        'Content-Type': 'application/json',
                        'Authorization': f'Token {session["tator_token"]}',
                    })
                localizations += get_localization_res.json()

        # add formatted comments to list
        for comment in self.comments:
            concept_name = None
            comment_dict = {
                'observation_uuid': comment,
                'image_url': self.comments[comment].get('image_url'),
                'video_url': self.comments[comment].get('video_url'),
                'video_sequence_name': self.comments[comment]['sequence'],
            }

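            # comments synced from VARS have no 'all_localizations' field; comments synced from Tator do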

            if 'all_localizations' not in self.comments[comment].keys() \
                    or self.comments[comment]['all_localizations'] is None \
                    or self.comments[comment]['all_localizations'] == '':
                # vars annotation
                guide_photo = None
                upon = None
                identity_certainty = None
                identity_reference = None
                depth = None
                vars_comment = None
                vars_res = requests.get(url=f'{self.annosaurus_url}/annotations/{comment}')
                try:
                    annotation = vars_res.json()
                    concept_name = annotation['concept']
                except (JSONDecodeError, KeyError):
                    problem_comment = self.comments[comment]
                    print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in VARS ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                    self.missing_records.append(problem_comment)
                    continue
                if annotation.get('associations'):
                    for association in annotation['associations']:
                        if association['link_name'] == 'identity-certainty':
                            identity_certainty = association['link_value']
                        elif association['link_name'] == 'identity-reference':
                            identity_reference = association['link_value']
                        elif association['link_name'] == 'guide-photo':
                            guide_photo = association['to_concept']
                        elif association['link_name'] == 'upon':
                            upon = association['to_concept']
                        elif association['link_name'] == 'comment':
                            vars_comment = association['link_value']
                if annotation.get('ancillary_data'):
                    # get ctd
                    depth = annotation['ancillary_data'].get('depth_meters')
                comment_dict['concept'] = concept_name
                comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
                comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
                comment_dict['associations'] = annotation.get('associations')
                comment_dict['identity_reference'] = identity_reference
                comment_dict['guide_photo'] = guide_photo  # underscore key so it matches the dataframe column below
                comment_dict['upon'] = upon
                comment_dict['identity_certainty'] = identity_certainty
                comment_dict['depth'] = round(depth) if depth else None
                comment_dict['comment'] = vars_comment
            else:
                # tator annotation
                if session.get('tator_token'):
                    annotation = next((loco for loco in localizations if loco['elemental_id'] == comment), None)
                    if annotation is None:
                        problem_comment = self.comments[comment]
                        problem_comment['timestamp'] = 'No timestamp available'
                        print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in Tator ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                        self.missing_records.append(problem_comment)
                        continue
                    elif annotation['variant_deleted']:
                        problem_comment = self.comments[comment]
                        problem_comment['timestamp'] = f'Media ID: {annotation["media"]}, Frame: {annotation["frame"]}'
                        print(f'{TERM_RED}ERROR: Could not find annotation with UUID {comment} in Tator ({problem_comment["sequence"]}, {problem_comment["timestamp"]}){TERM_NORMAL}')
                        self.missing_records.append(problem_comment)
                        continue
                    comment_dict['good_image'] = bool(annotation['attributes'].get('Good Image'))
                    concept_name = annotation['attributes'].get('Scientific Name')
                    comment_dict['all_localizations'] = json.loads(self.comments[comment].get('all_localizations'))
                    comment_dict['scientific_name'] = concept_name
                    comment_dict['media_id'] = annotation['media']
                    comment_dict['frame'] = annotation['frame']
                    comment_dict['recorded_timestamp'] = parse_datetime(annotation['recorded_timestamp']).strftime('%d %b %y %H:%M:%S UTC') if 'recorded_timestamp' in annotation.keys() else None
                    comment_dict['annotator'] = format_annotator(annotation['observer']) if 'observer' in annotation.keys() else self.comments[comment]['annotator']
                    if annotation.get('attributes'):
                        comment_dict['attracted'] = annotation['attributes'].get('Attracted')
                        comment_dict['frame_url'] = f'/tator/frame/{annotation["media"]}/{annotation["frame"]}'
                        comment_dict['categorical_abundance'] = annotation['attributes'].get('Categorical Abundance')
                        comment_dict['identification_remarks'] = annotation['attributes'].get('IdentificationRemarks')
                        comment_dict['morphospecies'] = annotation['attributes'].get('Morphospecies')
                        comment_dict['identified_by'] = annotation['attributes'].get('Identified By')
                        comment_dict['notes'] = annotation['attributes'].get('Notes')
                        comment_dict['qualifier'] = annotation['attributes'].get('Qualifier')
                        comment_dict['reason'] = annotation['attributes'].get('Reason')
                        comment_dict['tentative_id'] = annotation['attributes'].get('Tentative ID')
                else:
                    annotation = {}
                    comment_dict['all_localizations'] = [{}]
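            # resolve phylogeny for any concept we haven't already cached or already failed to look up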

            if concept_name and concept_name not in phylogeny.keys() and concept_name not in self.no_match_records:
                # get the phylogeny from VARS kb
                with requests.get(url=f'{self.vars_phylogeny_url}/{concept_name}') as vars_tax_res:
                    if vars_tax_res.status_code == 200:
                        # this gets us down to phylum
                        try:
                            vars_tree = vars_tax_res.json()['children'][0]['children'][0]['children'][0]['children'][0]['children'][0]
                            phylogeny[concept_name] = {}
                        except KeyError:
                            # remember the failure so we don't re-query or re-warn for this concept
                            self.no_match_records.add(concept_name)
                            print(f'{TERM_YELLOW}WARNING: Could not find phylogeny for concept "{concept_name}" in VARS knowledge base{TERM_NORMAL}')
                            vars_tree = {}
                        while 'children' in vars_tree.keys():
                            if 'rank' in vars_tree.keys():  # sometimes it's not
                                phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']
                            vars_tree = vars_tree['children'][0]
                        if 'rank' in vars_tree.keys():
                            phylogeny[concept_name][vars_tree['rank']] = vars_tree['name']
                    else:
                        self.no_match_records.add(concept_name)
                        print(f'\n{TERM_RED}Unable to find record for {concept_name}{TERM_NORMAL}')
            if concept_name in phylogeny.keys():
                for key in phylogeny[concept_name].keys():
                    # split to account for worms 'Phylum (Division)' case
                    comment_dict[key.split(' ')[0]] = phylogeny[concept_name][key]
            formatted_comments.append(comment_dict)

        # add to dataframe for sorting
        annotation_df = pd.DataFrame(formatted_comments, columns=[
            'observation_uuid',
            'concept',
            'scientific_name',
            'associations',
            'all_localizations',
            'attracted',
            'categorical_abundance',
            'identification_remarks',
            'identified_by',
            'notes',
            'qualifier',
            'reason',
            'morphospecies',
            'tentative_id',
            'identity_certainty',
            'identity_reference',
            'guide_photo',
            'good_image',
            'media_id',
            'frame',
            'comment',
            'image_url',
            'frame_url',
            'video_url',
            'upon',
            'recorded_timestamp',
            'video_sequence_name',
            'annotator',
            'depth',
            'phylum',
            'subphylum',
            'superclass',
            'class',
            'subclass',
            'superorder',
            'order',
            'suborder',
            'infraorder',
            'superfamily',
            'family',
            'subfamily',
            'genus',
            'species',
        ])
        annotation_df = annotation_df.sort_values(by=[
            'phylum',
            'subphylum',
            'superclass',
            'class',
            'subclass',
            'superorder',
            'order',
            'suborder',
            'infraorder',
            'superfamily',
            'family',
            'subfamily',
            'genus',
            'species',
            'concept',
            'identity_reference',
            'identity_certainty',
            'recorded_timestamp',
        ])
        annotation_df = annotation_df.replace({pd.NA: None, np.nan: None})
        temp_record_list = annotation_df.to_dict(orient='records')
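        # keep only non-null fields so each distilled record carries just the values it actually has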

        for record in temp_record_list:
            anno_dict = {key: value for key, value in record.items() if value is not None}
            self.distilled_records.append(anno_dict)
        print('processed!')

        # write the (possibly updated) phylogeny back to the cache, creating the directory if needed
        os.makedirs('cache', exist_ok=True)
        with open(os.path.join('cache', 'phylogeny.json'), 'w') as f:
            json.dump(phylogeny, f, indent=2)
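

# Minimal usage sketch (not part of the original module). All URLs and values below are illustrative
# placeholders, not real endpoints or data; the class must also run inside a Flask request context so that
# `session` (and any 'tator_token' it holds) is available, which is why this is left as a comment rather
# than an executable __main__ block.
#
#   comments = {
#       '<observation-uuid>': {
#           'sequence': 'Deep Discoverer 23060501',                  # dive/sequence name (required)
#           'timestamp': '2023-06-05T12:34:56Z',                     # used in error messages for VARS records
#           'annotator': 'J. Smith',                                 # fallback annotator name
#           'image_url': None,
#           'video_url': 'https://tator.example.org/media/123&frame=100',  # media id is parsed from this url
#           'all_localizations': '[{"points": [0.1, 0.2]}]',         # present only for Tator records
#       },
#   }
#   processor = CommentProcessor(
#       comments,
#       annosaurus_url='http://hurlstor.example.org:8082/anno/v1',
#       vars_phylogeny_url='http://hurlstor.example.org:8083/kb/v1/phylogeny/up',
#       tator_localizations_url='https://tator.example.org/rest/Localizations',
#   )
#   for record in processor.distilled_records:  # sorted taxonomically, with null fields dropped
#       print(record.get('concept') or record.get('scientific_name'))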