Coverage for application / tator / tator_sub_qaqc_processor.py: 14%
125 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-23 05:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-23 05:22 +0000
1import tator
3from application.tator.tator_base_qaqc_processor import TatorBaseQaqcProcessor
4from application.tator.tator_type import TatorLocalizationType
class TatorSubQaqcProcessor(TatorBaseQaqcProcessor):
    """
    QA/QC checks layered on top of TatorBaseQaqcProcessor.

    Every check method first calls self.process_records() and then overwrites self.final_records with the
    records it flags (get_unique_taxa stores a summary dict instead). process_records, final_records,
    tator_client and project_id are not defined in this class; they are expected to come from the base class.
    """

    def __init__(
        self,
        project_id: int,
        section_ids: list[str],
        api: tator.api,
        tator_url: str,
        darc_review_url: str = None,
        transect_media_ids: list[int] = None,
    ):
        """
        Forwards all arguments unchanged to TatorBaseQaqcProcessor.__init__.
        """
        super().__init__(
            project_id=project_id,
            section_ids=section_ids,
            api=api,
            darc_review_url=darc_review_url,
            tator_url=tator_url,
            transect_media_ids=transect_media_ids,
        )

    def check_missing_ancillary_data(self):
        """
        Finds records that are missing ancillary data attributes:

            * "DO Temperature (celsius)" (do_temp_c)
            * "DO Concentration Salin Comp (mol per L)" (do_concentration_salin_comp_mol_L)
            * "Depth" (depth_m)

        A record is kept when at least one of the three values is absent or falsy, so values such as 0 or ''
        are treated the same as a missing key.
        """
        self.process_records()
        self.final_records = [
            record for record in self.final_records
            if not record.get('do_temp_c')
            or not record.get('do_concentration_salin_comp_mol_L')
            or not record.get('depth_m')
        ]

    def check_missing_upon_and_not_fish(self):
        """
        Finds records with a missing or suspicious "upon" attribute and marks them with problems = 'Upon'.

        A record is flagged when any of the following holds:

            * "upon" is absent or empty
            * "upon" is one of the placeholder values '--' or '-'
            * "upon" contains "water" (case-insensitive) and the record's phylum is not 'Chordata'
        """
        self.process_records()
        placeholder_upons = {'--', '-', ''}
        flagged_records = []
        for record in self.final_records:
            upon = record.get('upon')
            # .get() for phylum: a record without a phylum is not known to be a fish, so it is flagged
            # instead of raising KeyError
            if (not upon
                    or upon in placeholder_upons
                    or ('water' in upon.lower() and record.get('phylum') != 'Chordata')):
                record['problems'] = 'Upon'
                flagged_records.append(record)
        self.final_records = flagged_records

    def check_upons_are_current_substrate_or_previous_animal(self, transect_media: list[dict]):
        """
        Finds records where the "upon" attribute is not a substring of any value in the current substrate
        at the record's frame, and is not the scientific name of any animal previously recorded in the same
        media (skips empty upons and upons containing "water").

        Flagged records get problems = 'Upon' and a 'substrate' entry holding the substrate state that was
        current at the record's frame (None if there is none). Diagnostic output is written with print().

        :param transect_media: passed through to self.tator_client.get_substrates_for_medias; its item
            schema is not visible here.
        """
        self.process_records()
        substrates = {
            substrate['media_id']: substrate['substrates']
            for substrate in
            self.tator_client.get_substrates_for_medias(project_id=self.project_id, transect_media=transect_media)
        }
        for media_id, substrate_entries in substrates.items():
            print(f'Substrates for media {media_id}:')
            for substrate in substrate_entries:
                print(f' Frame {substrate["frame"]}: { {k: v for k, v in substrate.items() if k not in ("frame", "timestamp")} }')
        # process records in playback order so "previously seen" means an earlier frame in the same media
        self.final_records.sort(key=lambda _record: (_record['media_id'], _record['frame']))
        flagged_records = []
        seen_animals: dict[int, set] = {}  # media_id -> set of scientific names seen so far in that media
        for record in self.final_records:
            media_id = record['media_id']
            frame = record['frame']
            upon = record.get('upon')
            seen_in_media = seen_animals.setdefault(media_id, set())
            if upon and 'water' not in upon.lower():
                media_substrates = substrates.get(media_id, [])
                is_upon_in_current_substrate = self._upon_matches_substrate(upon, media_substrates, frame)
                is_upon_previous_animal = upon in seen_in_media
                if is_upon_previous_animal:
                    print(f'Matched upon "{upon}" for record at frame {frame} to previously seen animal')
                if not is_upon_in_current_substrate and not is_upon_previous_animal:
                    print(f'No match found for upon "{upon}" for record at frame {frame}')
                    record['problems'] = 'Upon'
                    record['substrate'] = self._get_substrate_for_frame(media_substrates, frame)
                    flagged_records.append(record)
            # register the animal only after its own check so a record can never match itself
            seen_in_media.add(record['scientific_name'])
        self.final_records = flagged_records

    @staticmethod
    def _upon_matches_substrate(upon: str, substrate_entries: list[dict], frame: int) -> bool:
        """
        Returns True if upon is a substring (case-insensitive) of any substrate value in the current substrate
        state at the given frame.

        The 'frame' and 'timestamp' keys are ignored, as are placeholder values ('--', '-', '', 'Not Set',
        'None') and any value that is not a string. Prints a message when a match is found.
        """
        invalid_values = {'--', '-', '', 'Not Set', 'None'}
        current_substrate = TatorSubQaqcProcessor._get_substrate_for_frame(substrate_entries, frame)
        if current_substrate is None:
            return False
        upon_lower = upon.lower()
        for key, val in current_substrate.items():
            if key in ('frame', 'timestamp'):
                continue
            # non-string values (e.g. None or numbers) cannot contain the upon text; skip them instead of
            # failing on val.lower()
            if not isinstance(val, str) or val in invalid_values:
                continue
            if upon_lower in val.lower():
                print(f'Matched upon "{upon}" for record at frame {frame} to current {key} "{val}"')
                return True
        return False

    @staticmethod
    def _get_substrate_for_frame(substrate_entries: list[dict], frame: int) -> dict:
        """
        Returns the substrate state at the given frame, or None if there is no substrate state at or before
        that frame.

        The scan stops at the first entry whose frame is greater than the requested frame, so
        substrate_entries is assumed to be sorted by ascending 'frame' (TODO confirm against the producer).
        """
        current_substrate = None
        for substrate in substrate_entries:
            if substrate['frame'] > frame:
                break
            current_substrate = substrate
        return current_substrate

    def get_suspicious_records(self):
        """
        Finds records where the "upon" attribute is suspicious, i.e. the same as the scientific name, and marks
        them with problems = 'Scientific Name,Upon'.
        """
        self.process_records()
        flagged_records = []
        for record in self.final_records:
            if record['scientific_name'] == record.get('upon'):
                record['problems'] = 'Scientific Name,Upon'
                flagged_records.append(record)
        self.final_records = flagged_records

    def find_long_host_associate_time_diff(self):
        """
        Finds records where the "upon" attribute is an organism and there is either no previous record of that
        organism in the same media, or the closest previous record of that organism in the same media is more
        than one minute before the record.

        An "upon" is treated as an organism when its first character is uppercase. Flagged records get a
        'host_upon_time_diff' message describing the problem. If there are no records to check,
        self.final_records is set to an empty list.
        """
        self.process_records()
        if not self.final_records:
            # nothing to check; indexing final_records[0] below would raise IndexError
            self.final_records = []
            return
        media = self.tator_client.get_media_by_id(self.final_records[0]['media_id'])
        fps = media['fps']  # assume all media in a deployment are the same FPS
        flagged_records = []
        self.final_records.sort(key=lambda _record: (_record['media_id'], _record['frame']))
        for i, record in enumerate(self.final_records):
            upon = record.get('upon')
            if not upon or 'Non-uniform values across dots' in upon:
                # check_missing_upon_and_not_fish handles missing upons
                # check_upons_are_current_substrate_or_previous_animal handles non-uniform dots
                continue
            if not upon[0].isupper():  # expect substrates to be lowercase, scientific IDs to be capitalized
                continue
            most_recent_matching_host = None
            # start a lil ahead because we can have multiple localizations at the same timestamp
            j = min(i + 10, len(self.final_records) - 1)
            # make sure we're only looking at the current media_id and same timestamps
            while (j >= i and
                   (self.final_records[j]['media_id'] != record['media_id']
                    or self.final_records[j]['frame'] > record['frame'])):
                j -= 1
            # walk backwards through the current media until the host's scientific name is found
            while j >= 0 and self.final_records[j]['media_id'] == record['media_id']:
                if self.final_records[j]['scientific_name'] == upon:
                    most_recent_matching_host = self.final_records[j]
                    break
                j -= 1
            if not most_recent_matching_host:
                record['host_upon_time_diff'] = 'Unable to find matching host in previous records'
                flagged_records.append(record)
                continue
            time_diff_seconds = int((record['frame'] - most_recent_matching_host['frame']) / fps)
            if time_diff_seconds > 60:
                record['host_upon_time_diff'] = \
                    f'Most recent occurrence of "{upon}" more than 1 minute ago ({time_diff_seconds} seconds)'
                flagged_records.append(record)
        self.final_records = flagged_records

    def get_unique_taxa(self):
        """
        Builds a summary of the unique taxa in the processed records.

        Records are grouped by the combination of scientific_name, tentative_id and morphospecies. For each
        group the number of box and dot localizations (taken from each record's 'all_localizations') is
        counted. self.final_records is replaced with a dict keyed by
        '<scientific_name>:<tentative_id>:<morphospecies>'.
        """
        self.process_records()
        unique_taxa = {}
        for record in self.final_records:
            scientific_name = record.get('scientific_name')
            tentative_id = record.get('tentative_id', '')
            morphospecies = record.get('morphospecies', '')
            key = f'{scientific_name}:{tentative_id}:{morphospecies}'
            if key not in unique_taxa:
                # add new unique taxa to dict
                unique_taxa[key] = {
                    'scientific_name': scientific_name,
                    'tentative_id': tentative_id,
                    'morphospecies': morphospecies,
                    'box_count': 0,
                    'dot_count': 0,
                }
            taxon = unique_taxa[key]
            for localization in record['all_localizations']:
                # increment box/dot counts
                if TatorLocalizationType.is_box(localization['type']):
                    taxon['box_count'] += 1
                elif TatorLocalizationType.is_dot(localization['type']):
                    taxon['dot_count'] += 1
        self.final_records = unique_taxa

    def get_summary(self):
        """
        Not supported by this processor; always raises NotImplementedError.
        """
        raise NotImplementedError('TatorSubQaqcProcessor does not implement get_summary')

    def download_image_guide(self, app):
        """
        Not supported by this processor; always raises NotImplementedError.
        """
        raise NotImplementedError('TatorSubQaqcProcessor does not implement download_image_guide')