Leveraging Large Language Models for Interactive Exploration of MRI Research Reproducibility
A Self-Evolving Review
Literature overview
import pandas as pd
import datashader as ds
import datashader.transfer_functions as tf
import datashader.bundling as bd
import matplotlib.pyplot as plt
import colorcet
import matplotlib.colors
import matplotlib.cm
import bokeh.plotting as bpl
import bokeh.transform as btr
import holoviews as hv
import holoviews.operation.datashader as hd
from thefuzz import fuzz
import json
import time
import numpy as np
import umap.plot
import os
import umap.plot
import plotly.graph_objects as go
from plotly.offline import plot
from IPython.display import display, HTML
import base64
import plotly.io as pio
import requests
from PIL import Image
import matplotlib.pyplot as plt
pio.renderers.default = "plotly_mimetype"
# REQUIRED CELL
# Directory holding the data files that the cells below read from.
DATA_ROOT = "../data/repro-mri-scoping/repro_mri_scoping"
# Silence NumPy's divide-by-zero floating-point warnings. Presumably this is
# for the np.log() of citation counts in the plotting cells, where a count
# of 0 would otherwise emit a warning -- TODO confirm.
np.seterr(divide = 'ignore')
def get_by_id(paper_id):
    """
    Request the details of one paper from the Semantic Scholar batch endpoint.

    The abstract, tldr, year and embedding fields are requested for
    `paper_id`. The `requests` response object is returned when the status
    code is 200; any other status yields None.
    """
    batch_url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
    reply = requests.post(
        batch_url,
        params={'fields': 'abstract,tldr,year,embedding'},
        json={"ids": [paper_id]},
    )
    return reply if reply.status_code == 200 else None
def get_id(title):
    """
    Query Semantic Scholar API by title.

    Prints the total number of hits and every hit that is inspected.
    Returns the paperId of the first hit whose title has a fuzzy match
    ratio above 90 against `title`. Returns None when no hit matches or
    when the request does not come back with status 200.
    """
    api_url = "https://api.semanticscholar.org/graph/v1/paper/search"
    params = {"query": title}
    response = requests.get(api_url, params=params)
    if response.status_code != 200:
        return None
    result = response.json()
    print(result['total'])
    # Inspect every hit: giving up after the first non-matching one would
    # miss the paper whenever it is not ranked first. A response without a
    # "data" list is treated as having no hits.
    for hit in result.get('data', []):
        print(hit)
        if fuzz.ratio(hit['title'], title) > 90:
            return hit['paperId']
    return None
def bulk_search(query,save_json):
    """
    Run a Semantic Scholar bulk search and append the hits to a file.

    The API returns 1000 results per query. If the total number of
    hits is larger, the request is iterated using the continuation
    token returned with each page.

    query: search expression sent to the bulk search endpoint.
    save_json: path of the output file. Each paper is written as one JSON
        object per line, and the file is opened in append mode.
    """
    # Use the caller's query: a hard-coded search string here would make
    # the `query` argument a silent no-op.
    fields = "abstract"
    url = f"http://api.semanticscholar.org/graph/v1/paper/search/bulk?query={query}&fields={fields}"
    r = requests.get(url).json()
    print(f"Found {r['total']} documents")
    retrieved = 0
    with open(save_json, "a") as file:
        while True:
            if "data" in r:
                retrieved += len(r["data"])
                print(f"Retrieved {retrieved} papers...")
                for paper in r["data"]:
                    print(json.dumps(paper), file=file)
            # No token in the response means there is no further page.
            if "token" not in r:
                break
            r = requests.get(f"{url}&token={r['token']}").json()
    print(f"Retrieved {retrieved} papers. DONE")
def read_json_file(file_name):
    """
    Return the lines of `file_name` as a list of strings.

    The lines are returned unparsed, line endings included; decoding them
    (e.g. with json.loads) is left to the caller.
    """
    with open(file_name, 'r') as handle:
        return handle.readlines()
def write_json_file(file_name, dict_content):
    """
    Serialise `dict_content` to JSON and write it to `file_name`.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_name, 'w') as out_file:
        serialized = json.dumps(dict_content)
        out_file.write(serialized)
def get_output_dir(file_name):
    """
    Return the path of `file_name` inside the "../output" directory.

    The directory is created first when it does not exist yet. Note that
    the returned value is the joined file path, not the directory itself.
    """
    op_dir = "../output"
    # exist_ok=True replaces the separate exists-check and mkdir, which
    # could fail if the directory appeared between the two calls.
    os.makedirs(op_dir, exist_ok=True)
    return os.path.join(op_dir, file_name)
def flatten_dict(input):
    """
    Turn a list of dicts into a dict of lists.

    Every key seen in any of the dictionaries becomes a key of the result,
    mapped to the list of values found under that key, in input order.
    """
    columns = {}
    for record in input:
        for field, item in record.items():
            # setdefault starts the list the first time a key shows up
            columns.setdefault(field, []).append(item)
    return columns
/srv/conda/envs/notebook/lib/python3.10/site-packages/numba/np/ufunc/dufunc.py:343: NumbaWarning: Compilation requested for previously compiled argument types ((uint32,)). This has no effect and perhaps indicates a bug in the calling code (compiling a ufunc more than once for the same signature
warnings.warn(msg, errors.NumbaWarning)
/srv/conda/envs/notebook/lib/python3.10/site-packages/numba/np/ufunc/dufunc.py:343: NumbaWarning: Compilation requested for previously compiled argument types ((uint32,)). This has no effect and perhaps indicates a bug in the calling code (compiling a ufunc more than once for the same signature
warnings.warn(msg, errors.NumbaWarning)
/srv/conda/envs/notebook/lib/python3.10/site-packages/numba/np/ufunc/dufunc.py:343: NumbaWarning: Compilation requested for previously compiled argument types ((uint32,)). This has no effect and perhaps indicates a bug in the calling code (compiling a ufunc more than once for the same signature
warnings.warn(msg, errors.NumbaWarning)
#OPTIONAL CELL
# Path of the file that will receive the search results
literature_records = get_output_dir("literature_records.json")
search_terms = "(code | data | open-source | github | jupyter ) + (('MRI' + 'brain') | (MRI + 'neuroimaging')) + reproducib~"
# This will save ../output/literature_records.json
# NOTE: bulk_search opens the file in append mode, so re-running this cell
# adds the records again instead of replacing the file.
bulk_search(search_terms,literature_records)
Found 1189 documents
Retrieved 999 papers...
Retrieved 1188 papers...
Retrieved 1188 papers. DONE
1. Add articles associated with the reproducibility insights
Among the 1098 articles included in these Semantic Scholar records, SPECTER vector embeddings [1] were available for 612 articles, representing the publicly accessible content in abstracts and titles. The high-dimensional semantic information captured by the word embeddings was visualized using the uniform manifold approximation and projection method [2].
# REQUIRED CELL
# Load THE ORIGINAL LIST of literature records stored under DATA_ROOT
lit_list = read_json_file(os.path.join(DATA_ROOT,"literature_records.json"))
# To read the LATEST literature records returned by the above search instead,
# uncomment the two lines below.
# Note that this may include new results (i.e., new articles)
#literature_records = get_output_dir("literature_records.json")
#lit_list = read_json_file(literature_records)
# Collect all the paper IDs from the literature search (one JSON record per line)
lit_ids = [json.loads(entry)['paperId'] for entry in lit_list]
# Get all paper IDs for the articles linked to the reproducibility insights:
# the name of each .txt file, up to its first ".", is used as the paper ID
insights_path = os.path.join(DATA_ROOT,"repro_insights_parsed_nov23")
insights_ids = [f.split(".")[0] for f in os.listdir(insights_path) if f.endswith('.txt')]
# Combine all IDs (unique); going through set() does not preserve the order
paper_ids_all = list(set(lit_ids + insights_ids))
print(f"Total: {len(paper_ids_all)} papers ")
Total: 1098 papers
The following cell is commented out as it involves a series of API calls that take some time to complete.
# OPTIONAL CELL
# slices = [(0, 499), (499, 998), (998, None)]
# request_fields = 'title,venue,year,embedding,citationCount'
# results = []
# for start, end in slices:
# print(len(paper_ids_all[start:end]))
# re = requests.post(
# 'https://api.semanticscholar.org/graph/v1/paper/batch',
# params={'fields': request_fields},
# json={"ids": paper_ids_all[start:end]})
# if re.status_code == 200:
# print(f"Got results {start}:{end} interval")
# results.append(re.json())
# time.sleep(15) # Rate limiting.
# else:
# print(f"WARNING slice {start}:{end} did not return results: {re.text}")
# ALTERNATIVE
# The above API call should work fast as the requests are sent in batch.
# However, it frequently throws 429 error. If that's the case, following will
# also work, but takes much longer and a few articles may not be captured.
# results = []
# for cur_id in paper_ids_all:
# #print(len(paper_ids_all[start:end]))
# re = requests.get(
# f'https://api.semanticscholar.org/graph/v1/paper/{cur_id}',
# params={'fields': request_fields})
# if re.status_code == 200:
# results.append(re.json())
# else:
# print(f"WARNING request for {cur_id} could not return results: {re.text}")
# # Write outputs
# write_json_file(get_output_dir("literature_data.json"),results)
While utilizing the Semantic Scholar API, it’s important to note that not all returned article details include word embeddings generated by SPECTER (v1) based on the title and abstract. Currently, we retain only the articles for which embeddings are available, totaling 612 out of 1098. Future efforts may focus on addressing missing data by running the SPECTER model on articles without embeddings.
# REQUIRED CELL
# Load the ORIGINAL data (the first line of the file holds the whole JSON list)
lit_data = json.loads(read_json_file(os.path.join(DATA_ROOT,"literature_data.json"))[0])
# If you'd like to read from the output directory (LATEST)
#lit_data = json.loads(read_json_file(get_output_dir("literature_data.json"))[0])

# Build the lookup set once so each membership test in the loop is O(1)
# instead of a scan over the whole list of insight IDs.
insights_id_set = set(insights_ids)

papers_data = []
for res in lit_data:
    # Keep only the records that come with a non-empty embedding
    if not res.get('embedding'):
        continue
    papers_data.append({
        "embedding": res['embedding']['vector'],
        "title": res['title'],
        "venue": res['venue'],
        "year": res['year'],
        # Flag the articles that also appear among the insights files
        "is_mrmh": "Highlights" if res['paperId'] in insights_id_set else "Other",
        "paperId": res['paperId'],
        "n_citation": res['citationCount'],
    })

# Keep the record-per-article form (a list of dicts) under its own name
papers_data_dict = papers_data
# From a list of dicts to a dict of lists.
papers_data = flatten_dict(papers_data)
To visualize the semantic relationship between the captured articles (a 612x768 matrix), we are going to use the Uniform Manifold Approximation and Projection (UMAP) method.
# REQUIRED CELL
# Convert the embeddings once; both reductions use the same input matrix
embedding_matrix = np.array(papers_data['embedding'])
# Reduce to 2D feature
umap_model_2d = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2,random_state=42)
umap_2d = umap_model_2d.fit_transform(embedding_matrix)
# fit_transform() has already fitted the model, so it is reused as the mapper
# instead of fitting the same data a second time.
umap_2d_mapper = umap_model_2d
# Reduce to 3D feature
umap_model_3d = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3,random_state=42)
umap_3d = umap_model_3d.fit_transform(embedding_matrix)
/srv/conda/envs/notebook/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning:
n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
/srv/conda/envs/notebook/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning:
n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
Create the connectivity plot to be used as a background image for the interactive data visualization.
# OPTIONAL CELL
# Draw the connectivity of the 2D UMAP mapper with hammer edge bundling
ax = umap.plot.connectivity(umap_2d_mapper, edge_bundling='hammer')
# To save with opt size
# width_in_inches = 1200/300
# aspect_ratio = ax.figure.get_size_inches()[1] / ax.figure.get_size_inches()[0]
# height_in_inches = width_in_inches * aspect_ratio
# ax.figure.set_size_inches(width_in_inches, height_in_inches)
# Remove the title and the axes before saving
ax.set_title("")
ax.axis('off')
# NOTE(review): this writes "umap_bg_new.png", whereas the figure cell below
# reads 'umap_bg.png', so the new image is not picked up automatically --
# presumably intentional, confirm.
ax.figure.savefig("umap_bg_new.png", dpi=300, bbox_inches='tight',edgecolor=None)
# Close the figure once it has been written to disk
plt.close(ax.figure)
/srv/conda/envs/notebook/lib/python3.10/site-packages/umap/plot.py:895: UserWarning:
Hammer edge bundling is expensive for large graphs!
This may take a long time to compute!
The MRI systems cluster was predominantly composed of articles published in MRM, with only two publications appearing in a different journal [3, 4]. Additionally, this cluster was sufficiently distinct from the rest of the reproducibility literature, as can be seen by the location of the dark red dots on Fig. 1.
# Create Plotly figure
def highlights_marker():
    """Build a fresh marker spec: one colour for 'Highlights' articles, another for the rest."""
    return dict(color =["#562135" if label == 'Highlights' else "#f8aabe" for label in papers_data['is_mrmh']],
                size=9,
                line= dict(color="#ff8080",width=1),
                opacity=0.9)


# Hover text shown for each article
hover_labels = [
    f"<b>{rec['title']}</b> <br>{rec['venue']} <br>Cited by: {rec['n_citation']} <br>{rec['year']}"
    for rec in papers_data_dict
]

fig = go.Figure()
# Scatter plot for UMAP in 2D
scatter_2d = go.Scatter(
    x=umap_2d[:, 0],
    y=umap_2d[:, 1],
    mode='markers',
    marker=highlights_marker(),
    customdata=hover_labels,
    hovertemplate='%{customdata}',
    visible=True,
    name='2D'
)
fig.add_trace(scatter_2d)

# One restyle button per colouring scheme
colour_buttons = [
    dict(
        args=[{"showscale": True, "marker": highlights_marker()}],
        label="Highlights",
        method="restyle"
    ),
    dict(
        args=[{"marker": dict(color=np.log(papers_data['n_citation']),
                              colorscale='Plotly3',
                              size=9,
                              colorbar=dict(thickness=10, title="Citation (log)", orientation="h", len=0.5))}],
        label="Citation (log)",
        method="restyle"
    ),
    dict(
        args=[{"marker": dict(color=papers_data['year'],
                              colorscale='Viridis',
                              size=9,
                              colorbar=dict(thickness=10, orientation="h", len=0.5, title="Year"))}],
        label="Year",
        method="restyle"
    ),
]

# Add the row of buttons to the figure
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=colour_buttons,
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=0.98,
            yanchor="top"
        ),
    ]
)
# Read the background image and base64-encode it so that it can be embedded
# in the figure as a data URI. The with-block closes the file handle, which
# a bare open(...).read() leaves to the garbage collector.
with open('umap_bg.png', 'rb') as bg_file:
    plotly_logo = base64.b64encode(bg_file.read())
# with Image.open(os.path.join('umap_bg.png')) as img:
#     bg_width, bg_height = img.size
fig.update_layout(
    images=[dict(
        source='data:image/png;base64,{}'.format(plotly_logo.decode()),
        xref="paper",
        yref="paper",
        x=0.03,  # Start from the left edge
        y=0.978,  # Start from the top edge
        sizex=1,  # Stretch to full width
        sizey=1,  # Stretch to full height
        xanchor="left",
        yanchor="top",
        layer="below"  # Draw the image underneath the scatter points
    )]
)
# Hide both axes and set the title, size and background in a single call
fig.update_layout(
    yaxis={'visible': False, 'showticklabels': False},
    xaxis={'visible': False, 'showticklabels': False},
    title='Sentient Array of Knowledge Unraveling and Assessment (SAKURA)',
    hovermode='closest',
    autosize=True,
    width=850,
    height=850,
    paper_bgcolor="white",
    plot_bgcolor="white",
)
fig.show()
Figure-1: Edge-bundled connectivity of the 612 articles identified by the literature search. A notable cluster (red) is formed by the MRM articles that were featured in the reproducible research insights (purple nodes), particularly in the development of MRI methods. Notable clusters for other studies (pink) are annotated by gray circles.
def highlights_marker():
    """Build a fresh marker spec: one colour for 'Highlights' articles, another for the rest."""
    return dict(color =["#562135" if label == 'Highlights' else "#f8aabe" for label in papers_data['is_mrmh']],
                size=9,
                line= dict(color="#ff8080",width=1),
                opacity=0.9)


# Hover text shown for each article
hover_labels = [
    f"<b>{rec['title']}</b> <br>{rec['venue']} <br>Cited by: {rec['n_citation']} <br>{rec['year']}"
    for rec in papers_data_dict
]

fig = go.Figure()
# Scatter plot for UMAP in 3D
scatter_3d = go.Scatter3d(
    x=umap_3d[:, 0],
    y=umap_3d[:, 1],
    z=umap_3d[:, 2],
    mode='markers',
    marker=highlights_marker(),
    customdata=hover_labels,
    hovertemplate='%{customdata}',
    visible=True,
    name='3D'
)
fig.add_trace(scatter_3d)

# One restyle button per colouring scheme
colour_buttons = [
    dict(
        args=[{"marker": highlights_marker()}],
        label="Highlights",
        method="restyle"
    ),
    dict(
        args=[{"marker": dict(color=np.log(papers_data['n_citation']),
                              colorscale='Plotly3',
                              size=9,
                              colorbar=dict(thickness=10, orientation="h", len=0.5, title="Citation (log)"))}],
        label="Citation (log)",
        method="restyle"
    ),
    dict(
        args=[{"marker": dict(color=papers_data['year'],
                              colorscale='Viridis',
                              size=9,
                              colorbar=dict(thickness=10, len=0.5, orientation="h", title="Year"))}],
        label="Year",
        method="restyle"
    ),
]

# Add the row of buttons to the figure
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=colour_buttons,
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.11,
            xanchor="left",
            y=0.98,
            yanchor="top"
        ),
    ]
)
# Update layout
fig.update_layout(
    title='UMAP 3D',
    height=900,
    width=900,
    hovermode='closest',
    template='plotly_dark',
)
fig.show()
- Cohan, A., Feldman, S., Beltagy, I., Downey, D., & Weld, D. S. (2020). SPECTER: Document-level Representation Learning using Citation-informed Transformers. 10.18653/v1/2020.acl-main.207
- McInnes, L., Healy, J., & Melville, J. (2018). UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction.
- Adebimpe, A., Bertolero, M., Dolui, S., Cieslak, M., Murtha, K., Baller, E. B., Boeve, B., Boxer, A., Butler, E. R., Cook, P., Colcombe, S., Covitz, S., Davatzikos, C., Davila, D. G., Elliott, M. A., Flounders, M. W., Franco, A. R., Gur, R. E., Gur, R. C., … Satterthwaite, T. D. (2022). ASLPrep: a platform for processing of arterial spin labeled MRI and quantification of regional brain perfusion. Nat. Methods, 19(6), 683–686. 10.1038/s41592-022-01458-7
- Tilea, B., Alberti, C., Adamsbaum, C., Armoogum, P., Oury, J. F., Cabrol, D., Sebag, G., Kalifa, G., & Garel, C. (2009). Cerebral biometry in fetal magnetic resonance imaging: new reference data. Ultrasound Obstet. Gynecol., 33(2), 173–181. 10.1002/uog.6276