Skip to article content

Leveraging Large Language Models for Interactive Exploration of MRI Research Reproducibility

A Self-Evolving Review

Results

# REQUIRED CODE CELL
import os 
import re
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Tell NumPy to stay silent on floating-point divide-by-zero for the rest of
# the session (the previous error settings returned by seterr are discarded).
np.seterr(divide='ignore')

# Root folder of the dataset; every input path below is built from it.
DATA_ROOT = "../data/repro-mri-scoping/repro_mri_scoping"

# Purify and merge text
def read_file(file_path):
    """Return the full contents of the UTF-8 text file at *file_path*."""
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

def write_file(file_path, content):
    """Write *content* to *file_path* as UTF-8 text, replacing any existing file.

    The encoding is given explicitly so the bytes on disk do not depend on
    the platform's default locale and so that the file can be read back with
    ``read_file``, which also decodes as UTF-8.

    Args:
        file_path: Destination path; the file is created or truncated.
        content: Text to write.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def remove_enumeration_lines(text):
    """Strip numbered question lines (e.g. ``"3. Why did you ...?"``) from *text*.

    A line is removed when it starts with optional whitespace, a number, a
    dot and a whitespace character, and ends with a question mark. Only the
    matched characters are deleted; the newline that terminated the line is
    kept. Because the leading ``\\s*`` can also match newlines, blank lines
    directly above a matching line are consumed together with it.
    """
    # MULTILINE makes ^ and $ anchor at every line boundary, not just at the
    # start and end of the whole string.
    numbered_question = re.compile(r'^\s*\d+\.\s.*\?$', re.MULTILINE)
    return numbered_question.sub('', text)

def remove_by_pattern(input_text, patterns_to_remove):
    """Delete every match of each pattern from *input_text* and trim the result.

    Patterns are applied one after another, in the order given, each to the
    output of the previous one. Entries may be compiled patterns or pattern
    strings, since both are accepted by ``re.sub``. Leading and trailing
    whitespace is stripped from the final text.
    """
    cleaned = input_text
    for unwanted in patterns_to_remove:
        cleaned = re.sub(unwanted, '', cleaned)
    return cleaned.strip()

def get_output_dir(file_name):
    """Return the path of *file_name* inside ``../output``, creating the folder.

    Despite its name, this returns a file path, not a directory. The
    ``../output`` directory is resolved relative to the current working
    directory and is created on first use.

    Args:
        file_name: Name of the file to place in the output directory.

    Returns:
        ``os.path.join("../output", file_name)``.

    Raises:
        FileExistsError: If ``../output`` exists but is not a directory.
    """
    op_dir = "../output"
    # makedirs(exist_ok=True) replaces the racy "check, then mkdir" sequence:
    # it is a no-op when the directory already exists.
    os.makedirs(op_dir, exist_ok=True)
    return os.path.join(op_dir, file_name)
# Collect the .txt files to merge. os.listdir() returns entries in arbitrary
# order, so the paths are sorted to make the merged text identical from run
# to run.
directory_path = os.path.join(DATA_ROOT, "repro_insights_parsed_nov23")
input_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('.txt')
)

# Boilerplate headings and phrases that are deleted from every file before
# the texts are merged.
patterns_to_remove = [
    re.compile(r'Questions about the specific reproducible research habit'),
    re.compile(r'Title:'),
    re.compile(r'TLDR:'),
    re.compile(r'Abstract:'),
    re.compile(r'Reproducibility Insights:'),
    re.compile(r'General questions'),
    re.compile(r'This MRM Reproducible Research Insights interview'),
    re.compile(r'This work was singled out because it demonstrated exemplary reproducible research practices'),
]

# Clean each file, then merge everything in one pass. Each cleaned text is
# prefixed with a newline, so the merged string starts with "\n" when there
# is at least one file and is empty otherwise.
cleaned_texts = []
for cur_file in input_files:
    cur_content = read_file(cur_file)
    cur_content = remove_enumeration_lines(cur_content)
    cur_content = remove_by_pattern(cur_content, patterns_to_remove)
    cleaned_texts.append(cur_content)
all_text = ''.join("\n" + cleaned for cleaned in cleaned_texts)
text = all_text

# Start from wordcloud's built-in stopword list and add a few extra words
# that should not appear in the cloud.
stopwords = set(STOPWORDS)
stopwords.update(["will", "you", "others", "people", "using", "By"])

# Personal names to exclude. WordCloud compares stopwords (case-insensitively)
# against single word tokens, so a multi-word entry such as "Mathieu Boudreau"
# can never match on its own. Each name part is therefore registered as well;
# the full names are kept so the set remains a superset of the original one.
excluded_names = ["Mathieu Boudreau", "Agah Karakuzu", "Pinar S. Ozbay"]
for full_name in excluded_names:
    stopwords.add(full_name)
    # Drop the dot of initials ("S." -> "S") so the part equals the token.
    stopwords.update(full_name.replace(".", "").split())

# Load the reference image as an array and keep every third pixel along both
# image axes, which shrinks the mask and the amount of work done on it.
brain_image_path = os.path.join(DATA_ROOT, "brain_image.png")
brain_color = np.array(Image.open(brain_image_path))[::3, ::3]

# The mask starts as a copy of the colour image; pixels whose channel values
# sum to zero are set to 255 (white is what WordCloud treats as masked out).
brain_mask = brain_color.copy()
is_empty_pixel = brain_mask.sum(axis=2) == 0
brain_mask[is_empty_pixel] = 255

# Some finesse: enforce boundaries between colours so they get less washed
# out. Edges are detected as the Gaussian gradient magnitude (sigma 2) of each
# of the first three channels scaled to [0, 1], averaged over the channels;
# pixels above the 0.08 threshold are masked out as well.
channel_gradients = [
    gaussian_gradient_magnitude(brain_color[:, :, channel] / 255., 2)
    for channel in range(3)
]
edges = np.mean(channel_gradients, axis=0)
brain_mask[edges > .08] = 255

# Create the word cloud. Rendering is a bit sluggish; subsample the mask more
# strongly for quicker results. relative_scaling=0.2 lets word frequency
# influence font size only weakly (0 would rank-order words only), which
# reflects the data less accurately but makes a better picture.
wc = WordCloud(
    max_words=2000,
    mask=brain_mask,
    max_font_size=50,
    min_font_size=5,
    random_state=42,
    relative_scaling=0.2,
    stopwords=stopwords,
)

# Lay out the words and show the cloud with its default colouring.
wc.generate(text)
plt.imshow(wc)

# Recolour the words from the reference image, show the result in a larger
# figure, and save it to the output folder.
image_colors = ImageColorGenerator(brain_color)
wc.recolor(color_func=image_colors)
plt.figure(figsize=(10, 10))
plt.imshow(wc, interpolation="bilinear")
wc.to_file(get_output_dir("brain_wordcloud.png"))
<wordcloud.wordcloud.WordCloud at 0x711f68bfc4c0>
<Figure size 640x480 with 1 Axes><Figure size 1000x1000 with 1 Axes>

Figure 1: A word cloud generated from the 31 reproducible research insights published by Magnetic Resonance in Medicine Highlights.

Leveraging Large Language Models for Interactive Exploration of MRI Research Reproducibility
Creating a knowledge base for a custom GPT