Leveraging Large Language Models for Interactive Exploration of MRI Research Reproducibility
A Self-Evolving Review
Results
# REQUIRED CODE CELL
import os
import re
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
# Tell NumPy to ignore floating-point divide-by-zero conditions instead of warning.
np.seterr(divide = 'ignore')
# Root directory of the scoping-review data; the interview text folder and
# the brain image used below are both located relative to this path.
DATA_ROOT = "../data/repro-mri-scoping/repro_mri_scoping"
# Purify and merge text
def read_file(file_path):
    """Return the entire contents of ``file_path`` decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()
def write_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file.

    The encoding is stated explicitly so that files written here can always be
    read back by ``read_file`` (which decodes as UTF-8), regardless of the
    platform's default locale encoding.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)
def remove_enumeration_lines(text):
    """Strip numbered question lines (e.g. ``"3. Why ...?"``) from ``text``.

    A line is removed when it starts with optional whitespace, a number, a
    period and a whitespace character, and ends with a question mark. Only the
    matched characters are deleted; the line's trailing newline is kept.
    """
    # re.MULTILINE makes ^ and $ anchor at every line, not just the whole string.
    enumerated_question = re.compile(r'^\s*\d+\.\s.*\?$', flags=re.MULTILINE)
    return enumerated_question.sub('', text)
def remove_by_pattern(input_text, patterns_to_remove):
for pattern in patterns_to_remove:
input_text = re.sub(pattern, '', input_text)
return input_text.strip()
def get_output_dir(file_name):
    """Return the path of ``file_name`` inside ``../output``, creating that folder if needed.

    ``os.makedirs(..., exist_ok=True)`` replaces the previous check-then-create
    sequence, which could raise ``FileExistsError`` if the directory appeared
    between the existence check and the ``mkdir`` call.
    """
    op_dir = "../output"
    os.makedirs(op_dir, exist_ok=True)
    return os.path.join(op_dir, file_name)
# Collect every parsed interview (.txt) file. os.listdir() returns entries in
# arbitrary order, so the paths are sorted to keep the merged text - and hence
# the generated word cloud - reproducible across machines and file systems.
directory_path = os.path.join(DATA_ROOT, "repro_insights_parsed_nov23")
input_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('.txt')
)
# Boilerplate headings and phrases that are stripped from each file before the
# text is merged. Each phrase is listed once; applying the same pattern twice
# has no additional effect.
patterns_to_remove = [
    re.compile(r'Questions about the specific reproducible research habit'),
    re.compile(r'Title:'),
    re.compile(r'TLDR:'),
    re.compile(r'Abstract:'),
    re.compile(r'Reproducibility Insights:'),
    re.compile(r'General questions'),
    re.compile(r'This MRM Reproducible Research Insights interview'),
    re.compile(r'This work was singled out because it demonstrated exemplary reproducible research practices'),
]
# Clean each interview file (drop the numbered question lines, then the
# boilerplate phrases) and merge everything into one string, with a newline
# placed in front of every file's cleaned content.
cleaned_parts = []
for cur_file in input_files:
    cur_content = remove_by_pattern(
        remove_enumeration_lines(read_file(cur_file)),
        patterns_to_remove,
    )
    cleaned_parts.append("\n" + cur_content)
all_text = ''.join(cleaned_parts)
text = all_text
# Start from wordcloud's built-in stopword list and add project-specific words.
stopwords = set(STOPWORDS)
stopwords.update(["will", "you", "others", "people", "using", "By"])
# Names to exclude from the cloud. WordCloud compares stopwords against single
# tokens, so a multi-word entry such as "Mathieu Boudreau" never matches on its
# own. Each name is therefore also added word by word (periods removed, so
# "S." becomes "S"); the full-name entries are kept for backward compatibility.
excluded_names = ["Mathieu Boudreau", "Agah Karakuzu", "Pinar S. Ozbay"]
for full_name in excluded_names:
    stopwords.add(full_name)
    stopwords.update(full_name.replace(".", "").split())
# Load the brain image and keep every third pixel along both image axes.
brain_image_path = os.path.join(DATA_ROOT, "brain_image.png")
brain_color = np.array(Image.open(brain_image_path))[::3, ::3]

# Build the mask from a copy of the image: pixels whose channel values sum to
# zero are set to 255.
brain_mask = brain_color.copy()
zero_sum_pixels = brain_mask.sum(axis=2) == 0
brain_mask[zero_sum_pixels] = 255

# Some finesse: enforce boundaries between colors so they get less washed out.
# To do so, run edge detection on the first three channels and average the
# gradient magnitudes; strong edges are also set to 255 in the mask.
channel_edges = []
for channel in range(3):
    channel_edges.append(gaussian_gradient_magnitude(brain_color[:, :, channel] / 255., 2))
edges = np.mean(channel_edges, axis=0)
strong_edges = edges > .08
brain_mask[strong_edges] = 255
# Create the word cloud inside the brain-shaped mask. Rendering is a bit
# sluggish; subsample the image more strongly for quicker results.
# A low relative_scaling means the frequencies in the data are reflected less
# accurately, but it makes a better picture.
wordcloud_options = dict(
    max_words=2000,
    mask=brain_mask,
    max_font_size=50,
    min_font_size=5,
    random_state=42,
    relative_scaling=0.2,
    stopwords=stopwords,
)
wc = WordCloud(**wordcloud_options)

# Generate the word cloud from the merged interview text and preview it.
wc.generate(text)
plt.imshow(wc)

# Recolor the words using colors taken from the brain image.
image_colors = ImageColorGenerator(brain_color)
wc.recolor(color_func=image_colors)

# Show the recolored cloud in a larger figure and save it to the output folder.
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation="bilinear")
wc.to_file(get_output_dir("brain_wordcloud.png"))
<wordcloud.wordcloud.WordCloud at 0x711f68bfc4c0>


Figure 1: A word cloud generated from the 31 reproducible research insights published by Magnetic Resonance in Medicine Highlights.