Results - Leveraging Large Language Models for Interactive Exploration of MRI Research Reproducibility

# REQUIRED CODE CELL
import os 
import re
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Tell NumPy to ignore divide-by-zero floating-point errors instead of warning.
np.seterr(divide = 'ignore') 
# Base directory of the input data; a relative path, so it is resolved
# against the current working directory when the cell runs.
DATA_ROOT = "../data/repro-mri-scoping/repro_mri_scoping"

# Purify and merge text
def read_file(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def write_file(file_path, content):
    """Write ``content`` to ``file_path`` as UTF-8 text, replacing any existing file.

    The encoding is pinned to UTF-8 so that output matches what ``read_file``
    expects and does not depend on the platform's default locale encoding.

    Args:
        file_path: Destination path; the file is created or truncated.
        content: Text to write.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def remove_enumeration_lines(text):
    """Strip numbered question lines (e.g. ``"3. Why share code?"``) from ``text``.

    A line is removed when it starts with optional whitespace, a number,
    a dot and a whitespace character, and ends with a question mark. Only the
    matched characters are deleted; the line's trailing newline is kept.
    Because the leading ``\\s*`` may also match newlines, blank lines directly
    before such a question are consumed together with it.

    Args:
        text: Input text, possibly spanning many lines.

    Returns:
        The text with every matching span replaced by an empty string.
    """
    # MULTILINE makes ^ and $ anchor at every line boundary, not just the
    # start and end of the whole string.
    numbered_question = re.compile(r'^\s*\d+\.\s.*\?$', re.MULTILINE)
    return numbered_question.sub('', text)

def remove_by_pattern(input_text, patterns_to_remove):
    """Delete every match of each given pattern, then trim the result.

    Patterns are applied one after another, in the order given, each on the
    output of the previous one.

    Args:
        input_text: Text to clean.
        patterns_to_remove: Iterable of regex patterns (strings or compiled
            patterns) whose matches are removed.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    cleaned = input_text
    for regex in patterns_to_remove:
        cleaned = re.sub(regex, '', cleaned)
    return cleaned.strip()

def get_output_dir(file_name):
    """Return the path of ``file_name`` inside ``../output``, creating that folder if needed.

    Despite the name, the return value is the path of the file within the
    output directory, not the directory itself.

    Args:
        file_name: Name of the file to place in the output directory.

    Returns:
        ``os.path.join("../output", file_name)``.
    """
    op_dir = "../output"
    # exist_ok avoids the check-then-create race of an explicit exists() test.
    os.makedirs(op_dir, exist_ok=True)
    return os.path.join(op_dir, file_name)

# Collect every ``.txt`` file in the parsed-insights folder. The list is
# sorted because os.listdir returns entries in arbitrary order, which would
# make the order of the merged text differ between runs and machines.
directory_path = os.path.join(DATA_ROOT, "repro_insights_parsed_nov23")
input_files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('.txt')
)

# Recurring boilerplate phrases that are deleted from every file before the
# texts are merged. (A duplicated entry for the "Questions about the specific
# reproducible research habit" phrase was dropped; one pass is enough.)
patterns_to_remove = [
    re.compile(r'Questions about the specific reproducible research habit'),
    re.compile(r'Title:'),
    re.compile(r'TLDR:'),
    re.compile(r'Abstract:'),
    re.compile(r'Reproducibility Insights:'),
    re.compile(r'General questions'),
    re.compile(r'This MRM Reproducible Research Insights interview'),
    re.compile(r'This work was singled out because it demonstrated exemplary reproducible research practices'),
]

# Clean each file: drop numbered question lines, then the boilerplate phrases.
cleaned_parts = []
for cur_file in input_files:
    cur_content = read_file(cur_file)
    cur_content = remove_enumeration_lines(cur_content)
    cur_content = remove_by_pattern(cur_content, patterns_to_remove)
    cleaned_parts.append(cur_content)

# Merge once at the end instead of growing a string inside the loop. Each
# document is prefixed with a newline, as in the original concatenation.
all_text = ''.join("\n" + part for part in cleaned_parts)

<Figure size 640x480 with 1 Axes> — Figure 1: A word cloud generated from the 31 reproducible research insights published by Magnetic Resonance in Medicine Highlights.

<Figure size 1000x1000 with 1 Axes> — Figure 1: A word cloud generated from the 31 reproducible research insights published by Magnetic Resonance in Medicine Highlights.