Coverage for concept/concept.py: 82%

99 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-12 17:57 +0000

1from typing import Dict 

2 

3from util.constants import * 

4 

5 

class Concept:
    """
    Stores concept information retrieved from WoRMS and the VARS kb.

    On construction the VARS concept name is split into words and any qualifiers
    ('cf', 'sp', 'nr', 'aff', 'n', '/') are stripped off and remembered so they can be
    re-attached to the scientific name once a WoRMS record is loaded.
    """

    # Maps the rank of the deepest node in a taxon tree to the rank directly below it.
    # Used to place the 'cf. ...' entry one rank under the last confirmed rank.
    _RANK_BELOW = {
        'Genus': 'Species',
        'Family': 'Genus',
        'Order': 'Family',
        'Class': 'Order',
        'Phylum': 'Class',
    }

    def __init__(self, concept_name: str):
        """
        :param str concept_name: The VARS concept name of the organism we want to get information about.
        """
        self.concept_name = concept_name  # the given concept name from the VARS annotation record
        self.aphia_id = NULL_VAL_INT  # to fetch from WoRMS
        self.scientific_name = NULL_VAL_STRING  # to fetch from WoRMS
        self.authorship = NULL_VAL_STRING  # to fetch from WoRMS
        self.vernacular_names = NULL_VAL_STRING  # to fetch from WoRMS
        self.synonyms = []  # to fetch from VARS kb
        self.taxon_rank = NULL_VAL_STRING  # to fetch from WoRMS
        self.taxon_ranks = {}  # the phylogeny/taxon tree (kingdom, phylum, class, etc)
        self.descriptors = []  # extra words from the annotation record that aren't the scientific name
        self.concept_words = []  # for cleaning concept name
        self.concept_add_words = []  # for cleaning concept name
        self.cf_flag = []  # (cf = compare with) if record includes cf, should be manually reviewed
        self.nr_flag = []  # (nr = near) should be manually reviewed
        self.aff_flag = []  # (aff = looks similar to) should be manually reviewed
        self.sp_flag = False  # used to check whether to append 'sp.' to scientific name

        self.analyze_concept_name()

    def flatten_taxa_tree(self, tree: Dict, flat: Dict):
        """
        Flattens a nested taxonomy tree (each node has 'rank', 'scientificname' and 'child') into `flat`.

        Every node on the chain from `tree` down to the node whose 'child' is None is written to
        `flat` as rank -> scientific name. If this concept carries a 'cf' qualifier and the deepest
        node is a Genus, Family, Order, Class or Phylum, an extra 'cf. [words]' entry is written for
        the rank directly below it.

        :param Dict tree: The nested taxon tree from WoRMS.
        :param Dict flat: The flat taxon tree to fill in (mutated in place).
        """
        node = tree
        while True:
            flat[node['rank']] = node['scientificname']
            if node['child'] is None:
                break
            node = node['child']

        if self.cf_flag:
            rank_below = self._RANK_BELOW.get(node['rank'])
            if rank_below is not None:
                flat[rank_below] = f'cf. {" ".join(self.cf_flag)}'  # add 'cf. [concept]' to the rank below

    def load_from_record(self, record: Dict):
        """
        Assigns concept values given JSON object.

        Reads 'AphiaID', 'scientificname', 'rank' and 'authority' from the record, then re-attaches
        the qualifiers found by analyze_concept_name to the scientific name in the order
        'sp.', 'cf. ...', 'nr. ...', 'aff. ...'.

        :param Dict record: The JSON object to load data from.
        """
        self.aphia_id = record['AphiaID']
        self.scientific_name = record['scientificname']
        self.taxon_rank = record['rank']
        if self.sp_flag:
            self.scientific_name += ' sp.'
        qualifiers = (('cf.', self.cf_flag), ('nr.', self.nr_flag), ('aff.', self.aff_flag))
        for label, words in qualifiers:
            if words:
                self.scientific_name += ' ' + ' '.join([label, *words])
        if record['authority'] is not None:  # keep the existing authorship value when WoRMS has none
            self.authorship = record['authority']

    def _cut_tail(self, start: int, markers: tuple) -> list:
        """
        Removes every word from position `start` to the end of self.concept_words (in place).

        :param int start: Index of the first word to remove.
        :param tuple markers: Qualifier tokens that should be dropped rather than returned.
        :return: The removed words, in order, excluding any that appear in `markers`.
        """
        removed = self.concept_words[start:]
        del self.concept_words[start:]
        return [word for word in removed if word not in markers]

    def analyze_concept_name(self):
        """
        Analyzes 'extra bits' (eg 'cf', 'sp', '/') off the VARS concept name:

        EXAMPLE 1:
          VARS concept name = '[genus] [species] cf'
          'cf' means compare with, basically we're sure of the genus but not sure of species... but, it looks "close
          to" this species. We must only report the genus because it's the only thing we're sure of.
          We fetch the genus info from WoRMS and we save [species] and 'cf' locally to add back later.
          The record is populated with the genus info from WoRMS, and the final scientific name reported to DSCRTP is:
          [genus] cf. [species]
          This is the same for aff. (looks similar to) and nr. (near)

        EXAMPLE 2:
          VARS concept name = '[genus] cf sp'
          '[genus] cf' means we're pretty sure it's this genus, but not 100%.
          The 'sp' means it's a species in that genus.
          In this case, we don't care about the 'sp'.
          The record is populated with the FAMILY info from WoRMS (we need to get the genus's parent, then get the
          parent info)
          The final scientific name reported to DSCRTP is: 'cf. [genus]'
        """

        if '/' in self.concept_name:  # account for concepts with slashes in name, e.g. "Ptilella/Pennatula"
            self.concept_words = ['NEED_PARENT']
            return

        self.concept_words = self.concept_name.split(' ')  # create an array of the VARS concept name
        if 'unidentified' in self.concept_words:
            self.concept_words.remove('unidentified')

        # [genus] [species] cf: entity is identified as species within [genus], similar to [species] but not sure
        # [genus] cf: entity is similar to [genus], but not sure
        # can also be [phylum] cf, [class] cf, [subclass] cf, [order] cf, [family] cf
        if 'cf' in self.concept_words:
            if 'sp' in self.concept_words:  # if sp is in this list, just remove it
                self.concept_words.remove('sp')
            cf_index = self.concept_words.index('cf')  # get where cf is in the list

            # if cf is the second item in the list, we need to query worms for concept's PARENT.
            # A leading cf (index 0) is handled the same way: there is no word in front of it, so
            # 'cf_index - 1' would wrap around to the LAST word and, for a name that is only 'cf',
            # raise IndexError.
            if cf_index <= 1:  # this is the '[genus] cf' case
                del self.concept_words[cf_index]  # remove cf from list
                self.cf_flag = self.concept_words  # we'll use this list to add words back at the very end
                self.concept_words = ['NEED_PARENT']  # get the parent of this concept from HURL later

            else:  # this is the '[genus] [species] cf' case
                # add the word before cf to concept_add_words
                self.concept_add_words.append(self.concept_words[cf_index - 1])
                del self.concept_words[cf_index - 1:cf_index + 1]  # delete that word and cf from the list
                self.cf_flag = self.concept_add_words  # NOTE: cf_flag and concept_add_words share one list

        # [genus] sp: entity is identified as a species within the genus [genus], but the species is unknown
        elif 'sp' in self.concept_words and 'n' not in self.concept_words:
            self.sp_flag = True  # set this to true so we can add 'sp.' at the end
            sp_index = self.concept_words.index('sp')  # get where sp is in the list
            # get rid of all list items after sp (including sp); everything except sp becomes a descriptor
            self.descriptors.extend(self._cut_tail(sp_index, ('sp',)))

        # [genus] nr [species] &opt[subspecies]: identified as species within [genus], similar to [species] but not sure
        if 'nr' in self.concept_words:
            nr_index = self.concept_words.index('nr')  # get where nr is in the list
            # get rid of all list items after nr (including nr); everything except nr goes to nr_flag
            self.nr_flag.extend(self._cut_tail(nr_index, ('nr',)))

        # [genus] aff [species]: identified as species within [genus], similar to [species] but not sure
        # same as nr
        if 'aff' in self.concept_words:
            aff_index = self.concept_words.index('aff')  # get where aff is in the list
            # get rid of all list items after aff (including aff); everything except aff goes to aff_flag
            self.aff_flag.extend(self._cut_tail(aff_index, ('aff',)))

        # [genus] n sp -> scientific name: '[genus]', descriptors: 'Undescribed species'
        # [genus] (n subgenus) n sp -> scientific name: '[genus]', descriptors: 'Undescribed subgenus, undescribed species'
        # [family] n gen -> scientific name: '[family]', descriptors: 'Undescribed genus'
        if 'n' in self.concept_words:
            n_index = self.concept_words.index('n')  # get where n is in the list
            if 'gen' in self.concept_words:
                self.descriptors.append('Undescribed genus')
            elif 'subgenus)' in self.concept_words:
                n_index = self.concept_words.index('(n')  # cut from the opening '(n' instead
                self.descriptors.append('Undescribed subgenus, undescribed species')
            else:
                self.descriptors.append('Undescribed species')

            # get rid of all list items after n (including n);
            # append all items except n and sp, gen, subgenus to descriptors list
            self.descriptors.extend(self._cut_tail(n_index, ('n', 'sp', 'gen', 'subgenus)', '(n')))

170 and self.concept_words[n_index] != '(n': 

171 self.descriptors.append(self.concept_words[n_index]) 

172 del self.concept_words[n_index]