Little Science, Big Science, and Beyond
How Amateurs Shape the Scientific Landscape
Analysis notebook
This notebook analyses patent data scraped from WIPO PATENTSCOPE using the following steps:
On the main page (https://patentscope.wipo.int/search/en/search.jsf), run the search query.
In the section right under the search bar, set “Sort” to Relevance and “Per page” to 200.
1 Importing libraries
import pandas as pd
import plotly.express as px
import country_converter as coco
import plotly.io as pio
from IPython.display import IFrame
pio.renderers.default = "plotly_mimetype"
cc = coco.CountryConverter()
2 Importing data
# Load the scraped PATENTSCOPE export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that the
# default chunked parser raises for this file.
pat = pd.read_csv("../data/amateur-science/pat.csv", low_memory=False)
/tmp/ipykernel_22/3675833274.py:1: DtypeWarning:
Columns (4,8) have mixed types. Specify dtype option on import or set low_memory=False.
3 Data Processing
Adding country names, codes and continents
# Translate each patent-office code into a short country name.
# NOTE: not_found is the literal string "NaN", not a missing value, so codes
# the converter cannot resolve end up as that text in "Full Country".
pat["Full Country"] = cc.pandas_convert(
    series=pat["Office Country"],
    to="name_short",
    not_found="NaN",
)
CS not found in ISO2
SU not found in ISO2
DD not found in ISO2
EP not found in ISO2
EA not found in ISO2
AP not found in ISO2
WO not found in ISO2
Function to get patent type
# Substrings that mark an applicant name as an organisation (company,
# university, research body, ...). They are matched against the LOWER-CASED
# applicant name, so every entry must itself be lower-case. Built once at
# module level instead of on every row.
_ORGANISATION_KEYWORDS = frozenset({
    'research', 'group', 'society', 'foundation', 'inc', 'compania', 'lab', 'industries', 'societe', 'manufacturing', 'machine', 'co ',
    'corp', 'association', 'university', 'institute', 'company', 'llc', 'ltd', 'lfp', 'industria', 'industrie', 'firm', '+', 'co.',
    'pharmaceuticals', 'roche', "l'oreal", 'campos', 'technologies', 'corp', 'inst', 'pharma', 'electronics', 'volvo', 'corporation',
    'ltda', 'communications', 'ifp', 'technik', 'siemens', 's.a', 'operations', 'limited', 'gmbh', 'novartis', 'agency',
    'elektronik', 's.p.a', 'uniwersytet', 's.l', 's.r.l', 'a.s', 'urs', 'ag ', 'universiteit', 'hospital', 'silverphase',
    'sanofi', 'science', 'medicament', 'recherche', 'tech', 'international', 'networks', 'france', 'nucleix', 'cosmetique',
    'astrazeneca', 'universite', 'les ', 'igt', 'service', 'services', 'univ', 'products', 'product', 'bank', 'compan',
    'cotton', '& co', '&co', 'comp', 'constructions', 'meca', 'sciences', 'tech', 'consulting', ' spa', 'management', 'associates',
    'holdings', 'systems', ' as', ' co', 'electric', 'printing', 'steel', ' ind', 'chemicals', ' ag', 'a.g', "johnson & johnson",
    'gm. b. h', 'informazioni', 'g. m. b. h.', 'anonyme', 'limitada', 'sociedad', 'solex', 's. a', 'eleuterio', 'societr',
    'commissariat', 's.a', 'interlight', 's. l', 'electronique', 'moebius & ruppert', 'g m b h', 'elektro', 'società', 'energía',
    'philips', '&', 's.c.i.', 'société', 'sté', 'g.m.b.h', 'energy', 'a. k', 'investigación', 'fabrica', 'limited',
})


def _is_organisation(name):
    """Return True if any organisation keyword occurs in *name*, ignoring case."""
    lowered = name.lower()
    return any(keyword in lowered for keyword in _ORGANISATION_KEYWORDS)


def determine_status(row):
    """Classify one patent row by who filed it.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'Inventor' and 'Applicant'. Either value may be missing (NaN/None).

    Returns:
        One of three labels:
        - "Solo Inventor": the applicant name is contained in the inventor
          name, or only one of the two fields is filled in and it does not
          look like an organisation.
        - "Research/Company": the applicant name contains an organisation
          keyword.
        - "default": the row cannot be classified (both fields missing, the
          placeholder 'applicant name missing', a non-text value, or an
          applicant that matches neither rule).
    """
    inventor = row['Inventor']
    applicant = row['Applicant']
    has_inventor = pd.notna(inventor)

    # No applicant: an inventor on their own counts as a solo filing.
    if not pd.notna(applicant):
        return "Solo Inventor" if has_inventor else "default"

    # A non-text applicant (the CSV has mixed-type columns) cannot be matched
    # against names or keywords, so it is explicitly unclassifiable instead of
    # raising on .lower() or falling through to an implicit None.
    if not isinstance(applicant, str):
        return "default"

    if has_inventor:
        if not isinstance(inventor, str):
            return "default"
        # Substring test: the applicant is (one of) the inventor(s).
        if applicant in inventor:
            return "Solo Inventor"
        return "Research/Company" if _is_organisation(applicant) else "default"

    # Applicant present, inventor missing.
    if _is_organisation(applicant):
        return "Research/Company"
    if applicant == 'applicant name missing':
        return "default"
    return "Solo Inventor"
Converting to date, applying function to extract patent type, grouping by year and patent type
# Parse the "dd.mm.yyyy" publication strings into real timestamps.
pat["Publication date"] = pd.to_datetime(pat["Publication date"], format="%d.%m.%Y")
# Keep just the calendar year for grouping.
pat["Year"] = pat["Publication date"].dt.year
# Label every row with its patent type (row-wise apply).
pat["Type"] = pat.apply(determine_status, axis="columns")
# Count patents for each (year, type, country) combination.
pat = (
    pat.groupby(["Year", "Type", "Full Country"])
    .size()
    .reset_index(name="Patents")
)
Getting proportions
# Drop the rows whose Type is "default".
# .copy() yields an independent frame, so adding the "Proportion" column below
# no longer writes into a slice of `pat` (the cause of the
# SettingWithCopyWarning).
pat_proportions = pat[pat["Type"] != "default"].copy()
# Total number of remaining patents per year, aligned row by row with the frame.
total_per_year = pat_proportions.groupby('Year')['Patents'].transform('sum')
# Share of its year's total contributed by each row.
pat_proportions['Proportion'] = pat_proportions['Patents'] / total_per_year
# Collapse the country dimension: add up counts and shares per year and type.
# NOTE(review): .sum() is applied to every remaining column, including the
# text column "Full Country" - confirm that column is not needed downstream.
pat_proportions = pat_proportions.groupby(["Year", "Type"]).sum().reset_index()
/tmp/ipykernel_22/858095603.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
3.1 Data Visualization: Stacked Bars
# Bar chart of the yearly proportion of each patent type.
fig1 = px.bar(
    pat_proportions,
    x="Year",
    y="Proportion",
    color="Type",
    template="plotly_white",
    color_discrete_sequence=["#F7C0BB", "#EB5E51"],
    title="<b>Worldwide Patent Applications Classification</b>",
    height=400,
    width=1100,
    hover_name="Year",
    hover_data=["Patents", "Proportion"],
    orientation="v",
)

# Axes: a tick every 5 years over 1910-2023, proportions shown as whole percents.
fig1.update_xaxes(
    title=None,
    dtick=5,
    ticks="outside",
    ticklen=4,
    range=[1910, 2023],
)
fig1.update_yaxes(title="Relative Proportion", tickformat=",.0%")

# Typography and legend.
fig1.update_layout(
    legend_title=None,
    font_family="Calibri",
    font_color="black",
    title_font_family="Calibri",
    font=dict(size=14),
    title_font_color="black",
)

# Source note, positioned in paper coordinates below the plot area.
annotations = [
    dict(
        text="Source: WIPO Patentscope",
        showarrow=False,
        x=0.99,
        y=-0.23,
        xref="paper",
        yref="paper",
        font=dict(size=11, color="grey"),
    ),
]
for annotation in annotations:
    fig1.add_annotation(annotation)

fig1.show()
Loading...
# Embed the externally hosted carousel page in the notebook output.
IFrame(
    src="https://janeabdo.github.io/carousel/",
    width="800",
    height="700",
)
Loading...