Coverage for concept/concept.py: 82%
99 statements
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-12 17:57 +0000
« prev ^ index » next coverage.py v7.8.0, created at 2025-04-12 17:57 +0000
1from typing import Dict
3from util.constants import *
class Concept:
    """
    Stores concept information retrieved from WoRMS and the VARS kb.

    On construction the VARS concept name is split into words and any qualifiers ('cf', 'sp', 'nr', 'aff',
    'n', '/') are stripped off and remembered, leaving `concept_words` holding only the words to look up.
    The remembered qualifiers are appended back onto the scientific name by `load_from_record`.
    """

    def __init__(self, concept_name: str):
        """
        :param str concept_name: The VARS concept name of the organism we want to get information about.
        """
        self.concept_name = concept_name  # the given concept name from the VARS annotation record
        self.aphia_id = NULL_VAL_INT  # to fetch from WoRMS
        self.scientific_name = NULL_VAL_STRING  # to fetch from WoRMS
        self.authorship = NULL_VAL_STRING  # to fetch from WoRMS
        self.vernacular_names = NULL_VAL_STRING  # to fetch from WoRMS
        self.synonyms = []  # to fetch from VARS kb
        self.taxon_rank = NULL_VAL_STRING  # to fetch from WoRMS
        self.taxon_ranks = {}  # the phylogeny/taxon tree (kingdom, phylum, class, etc)
        self.descriptors = []  # extra words from the annotation record that aren't the scientific name
        self.concept_words = []  # for cleaning concept name
        self.concept_add_words = []  # for cleaning concept name
        self.cf_flag = []  # (cf = compare with) if record includes cf, should be manually reviewed
        self.nr_flag = []  # (nr = near) should be manually reviewed
        self.aff_flag = []  # (aff = looks similar to) should be manually reviewed
        self.sp_flag = False  # used to check whether to append 'sp.' to scientific name

        self.analyze_concept_name()

    def flatten_taxa_tree(self, tree: Dict, flat: Dict) -> None:
        """
        Recursive function taking a taxonomy tree returned from WoRMS API. Flattens the tree into `flat`.

        Each node contributes one `rank -> scientificname` entry. When the leaf is reached and this concept
        carries a 'cf' qualifier, an extra 'cf. [words]' entry is written for the rank directly below the leaf
        (only for leaves ranked Genus through Phylum).

        :param Dict tree: The nested taxon tree from WoRMS ('rank', 'scientificname' and 'child' keys).
        :param Dict flat: The flat taxon tree being filled in; mutated in place.
        """
        flat[tree['rank']] = tree['scientificname']

        child = tree.get('child')  # a missing 'child' key is treated like an explicit None (leaf node)
        if child is not None:
            self.flatten_taxa_tree(child, flat)
        elif self.cf_flag:
            # maps the leaf's rank to the rank immediately below it
            rank_below = {
                'Genus': 'Species',
                'Family': 'Genus',
                'Order': 'Family',
                'Class': 'Order',
                'Phylum': 'Class',
            }
            lower_rank = rank_below.get(tree['rank'])
            if lower_rank is not None:
                flat[lower_rank] = f'cf. {" ".join(self.cf_flag)}'  # add 'cf. [concept]' to the rank below

    def load_from_record(self, record: Dict) -> None:
        """
        Assigns concept values given JSON object.

        The scientific name from the record is extended with the qualifiers found in the concept name, in
        this order: ' sp.', then ' cf. [words]', ' nr. [words]', ' aff. [words]'.

        :param Dict record: The JSON object to load data from. Must contain 'AphiaID', 'scientificname' and
            'rank'; 'authority' is optional and ignored when missing or None.
        """
        self.aphia_id = record['AphiaID']
        self.scientific_name = record['scientificname']
        self.taxon_rank = record['rank']

        suffix_parts = []
        if self.sp_flag:
            suffix_parts.append('sp.')
        for abbreviation, words in (('cf.', self.cf_flag), ('nr.', self.nr_flag), ('aff.', self.aff_flag)):
            if words:
                suffix_parts.append(abbreviation)
                suffix_parts.extend(words)
        if suffix_parts:
            self.scientific_name += ' ' + ' '.join(suffix_parts)

        authority = record.get('authority')  # tolerate records that omit the key entirely
        if authority is not None:
            self.authorship = authority

    def _cut_from(self, start: int, ignored: tuple) -> list:
        """
        Removes every word of `self.concept_words` from index `start` to the end (in place).

        :param int start: Index of the first word to remove.
        :param tuple ignored: Words that are dropped without being returned (the qualifier tokens themselves).
        :return: The removed words, in order, excluding any word listed in `ignored`.
        """
        kept = [word for word in self.concept_words[start:] if word not in ignored]
        del self.concept_words[start:]
        return kept

    def analyze_concept_name(self) -> None:
        """
        Analyzes 'extra bits' (eg 'cf', 'sp', '/') off the VARS concept name:

        EXAMPLE 1:
          VARS concept name = '[genus] [species] cf'
          'cf' means compare with, basically we're sure of the genus but not sure of species... but, it looks
          "close to" this species. We must only report the genus because it's the only thing we're sure of.
          We fetch the genus info from WoRMS and we save [species] and 'cf' locally to add back later.
          The record is populated with the genus info from WoRMS, and the final scientific name reported to
          DSCRTP is: [genus] cf. [species]
          This is the same for aff. (looks similar to) and nr. (near)

        EXAMPLE 2:
          VARS concept name = '[genus] cf sp'
          '[genus] cf' means we're pretty sure it's this genus, but not 100%.
          The 'sp' means it's a species in that genus.
          In this case, we don't care about the 'sp'.
          The record is populated with the FAMILY info from WoRMS (we need to get the genus's parent, then get
          the parent info)
          The final scientific name reported to DSCRTP is: 'cf. [genus]'

        Results are written to `concept_words`, `concept_add_words`, `descriptors`, `cf_flag`, `nr_flag`,
        `aff_flag` and `sp_flag`.
        """
        if '/' in self.concept_name:  # account for concepts with slashes in name, e.g. "Ptilella/Pennatula"
            self.concept_words = ['NEED_PARENT']
            return

        self.concept_words = self.concept_name.split(' ')  # create an array of the VARS concept name
        if 'unidentified' in self.concept_words:
            self.concept_words.remove('unidentified')

        # [genus] [species] cf: entity is identified as species within [genus], similar to [species] but not sure
        # [genus] cf: entity is similar to [genus], but not sure
        # can also be [phylum] cf, [class] cf, [subclass] cf, [order] cf, [family] cf
        if 'cf' in self.concept_words:
            if 'sp' in self.concept_words:  # if sp is in this list, just remove it
                self.concept_words.remove('sp')
            cf_index = self.concept_words.index('cf')  # get where cf is in the list

            # if cf is the second item in the list, we need to query worms for concept's PARENT
            if cf_index == 1:  # this is the '[genus] cf' case
                del self.concept_words[1]  # remove cf from list
                self.cf_flag = self.concept_words  # we'll use this list to add words back at the very end
                self.concept_words = ['NEED_PARENT']  # get the parent of this concept from HURL later
            else:  # this is the '[genus] [species] cf' case
                # NOTE(review): a leading 'cf' (cf_index == 0) makes cf_index - 1 wrap around to the end of
                # the list; that pre-existing behavior is kept as-is - confirm whether such names can occur.
                # move the word before cf to concept_add_words
                self.concept_add_words.append(self.concept_words.pop(cf_index - 1))
                self.concept_words.pop(cf_index - 1)  # cf has shifted down one slot; delete it too
                # store a copy so cf_flag and concept_add_words do not share one mutable list
                self.cf_flag = list(self.concept_add_words)

        # [genus] sp: entity is identified as a species within the genus [genus], but the species is unknown
        elif 'sp' in self.concept_words and 'n' not in self.concept_words:
            self.sp_flag = True  # set this to true so we can add 'sp.' at the end
            sp_index = self.concept_words.index('sp')
            # get rid of all list items after sp (including sp); everything except sp becomes a descriptor
            self.descriptors.extend(self._cut_from(sp_index, ('sp',)))

        # [genus] nr [species] &opt[subspecies]: identified as species within [genus], similar to [species] but
        # not sure
        if 'nr' in self.concept_words:
            nr_index = self.concept_words.index('nr')
            # get rid of all list items after nr (including nr); everything except nr goes to nr_flag
            self.nr_flag.extend(self._cut_from(nr_index, ('nr',)))

        # [genus] aff [species]: identified as species within [genus], similar to [species] but not sure
        # same as nr
        if 'aff' in self.concept_words:
            aff_index = self.concept_words.index('aff')
            # get rid of all list items after aff (including aff); everything except aff goes to aff_flag
            self.aff_flag.extend(self._cut_from(aff_index, ('aff',)))

        # [genus] n sp -> scientific name: '[genus]', descriptors: 'Undescribed species'
        # [genus] (n subgenus) n sp -> scientific name: '[genus]', descriptors: 'Undescribed subgenus,
        #   undescribed species'
        # [family] n gen -> scientific name: '[family]', descriptors: 'Undescribed genus'
        if 'n' in self.concept_words:
            n_index = self.concept_words.index('n')  # get where n is in the list
            if 'gen' in self.concept_words:
                self.descriptors.append('Undescribed genus')
            elif 'subgenus)' in self.concept_words:
                n_index = self.concept_words.index('(n')  # cut from the opening '(n' instead
                self.descriptors.append('Undescribed subgenus, undescribed species')
            else:
                self.descriptors.append('Undescribed species')

            # get rid of all list items after n (including n); anything that is not one of the qualifier
            # tokens is kept as a descriptor
            self.descriptors.extend(self._cut_from(n_index, ('n', 'sp', 'gen', 'subgenus)', '(n')))