NCIT编号转NCBI编号
1. 数据库介绍
NCIt (National Cancer Institute Thesaurus)是由美国国家癌症研究所(NCI)维护的一个生物医学术语本体库。它提供了一个广泛的、标准化的生物医学术语集,专注于癌症以及与之相关的领域,包括临床护理、生物医学研究、分子生物学、遗传学等。NCIt是在医学研究和临床实践中被广泛用于数据交换、注释和分析的关键资源。
NCIt官网提供的映射表中并不包含微生物术语到最常用的NCBI分类编号(tax_ID)的映射,因此这里提供一种爬虫方式,将NCIt中的微生物编号转换为NCBI的tax_ID。
2. 爬虫函数
# -*- coding: utf-8 -*-
# @Author : lantary
# @Email : lantary-w@qq.com
# @Blog : https://lantary.cn
import json
from typing import Optional

import requests
from bs4 import BeautifulSoup
def _save_cache(temp_file: str, fetched_ids: dict) -> None:
    """Write the NCIt -> NCBI lookup cache to ``temp_file`` as JSON."""
    with open(temp_file, 'w', encoding='utf-8') as file:
        json.dump(fetched_ids, file)


def NCIt_to_NCBI(NCIt_id: str, temp_file: str = 'temp.json') -> Optional[str]:
    """
    Look up the NCBI taxonomy ID for an NCIt concept code.

    The NCIt browser concept page is fetched and scanned for the
    ``NCBI_Taxon_ID`` property row. Results, including "not found" results
    (stored as an empty string), are cached in ``temp_file`` so that the same
    code is never requested twice.

    :param NCIt_id: NCIt concept code, e.g. ``'C114246'``
    :param temp_file: path of the JSON cache file used to avoid repeated requests
    :return: the NCBI taxonomy ID as a string, or ``None`` when the page could
        not be fetched or the concept has no ``NCBI_Taxon_ID`` property
    """
    start_url = (
        'https://ncithesaurus.nci.nih.gov/ncitbrowser/ConceptReport.jsp'
        '?dictionary=NCI_Thesaurus&ns=ncit&code=' + NCIt_id
    )

    try:
        with open(temp_file, 'r', encoding='utf-8') as f:
            fetched_ids = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        fetched_ids = {}
    if not isinstance(fetched_ids, dict):
        # A cache file holding anything but a JSON object is unusable.
        fetched_ids = {}

    # Check the local cache before hitting the network.
    if NCIt_id in fetched_ids:
        NCBI_id = fetched_ids[NCIt_id]
        print(f'already fetched {NCIt_id}, {NCIt_id} -> {NCBI_id}')
        # An empty/null entry records an earlier "not found" result; report it
        # as None so cached and uncached calls return the same value.
        return None if NCBI_id in ('', None) else str(NCBI_id)

    try:
        # A timeout keeps one stalled connection from hanging a whole batch.
        response = requests.get(start_url, timeout=30)
    except requests.RequestException:
        # Network failures are reported the same way as a non-200 response.
        print(f'failed to get {NCIt_id} page')
        return None

    if response.status_code != 200:
        # The page could not be retrieved; nothing is cached so it can be retried.
        print(f'failed to get {NCIt_id} page')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    dataCell_list = soup.find_all(
        'td', attrs={'class': 'dataCellText', 'scope': 'row'}
    )
    for dataCell in dataCell_list:
        if dataCell.get_text().strip() != 'NCBI_Taxon_ID':
            continue
        value_cell = dataCell.find_next_sibling('td')
        if value_cell is None:
            # Label cell without a value cell: keep scanning.
            continue
        NCBI_id = value_cell.get_text().strip()
        if not NCBI_id:
            continue
        print(f'success to get {NCIt_id} page, {NCIt_id} -> {NCBI_id}')
        fetched_ids[NCIt_id] = NCBI_id
        _save_cache(temp_file, fetched_ids)
        return str(NCBI_id)

    # The concept has no NCBI taxonomy ID; remember that with an empty entry.
    fetched_ids[NCIt_id] = ""
    _save_cache(temp_file, fetched_ids)
    print(f'{NCIt_id} maybe not a microbe, or not in NCBI')
    return None
if __name__ == '__main__':
    # Demo run: resolve one sample NCIt concept code and show the result.
    demo_result = NCIt_to_NCBI('C114246')
    print(demo_result)
本文写于2023.12.26,请注意文章时效。