Analysis notebook - Little Science, Big Science, and Beyond

Analysis notebook

This notebook analyses patent data scraped from WIPO PATENTSCOPE. The data were collected as follows:
On the main page (https://patentscope.wipo.int/search/en/search.jsf), set “Field” to Publication Date. Then enter the first year of interest in the “Search terms...” box.
In the section directly below the search bar, set “Sort” to Relevance and “Per page” to 200.

1. Importing libraries

import pandas as pd 
import plotly.express as px
import country_converter as coco
import plotly.io as pio
from IPython.display import IFrame
# Make "plotly_mimetype" the default renderer used when figures are shown
pio.renderers.default = "plotly_mimetype"
# Shared CountryConverter instance, used below to translate country codes
cc = coco.CountryConverter()

2. Importing data

# Load the scraped patent records. low_memory=False makes pandas infer each
# column's dtype from the whole file instead of chunk by chunk, which avoids
# the DtypeWarning about mixed types in columns 4 and 8.
pat = pd.read_csv("../data/amateur-science/pat.csv", low_memory=False)

/tmp/ipykernel_22/3675833274.py:1: DtypeWarning:

Columns (4,8) have mixed types. Specify dtype option on import or set low_memory=False.

3. Data Processing

Adding full country names (short form) derived from the patent office country codes

# Translate each patent office code into a short country name; codes the
# converter cannot resolve are stored as the literal string "NaN".
pat["Full Country"] = cc.pandas_convert(
    series=pat["Office Country"],
    to="name_short",
    not_found="NaN",
)

CS not found in ISO2

SU not found in ISO2

DD not found in ISO2

EP not found in ISO2

EA not found in ISO2

AP not found in ISO2

WO not found in ISO2

Function to get patent type

# Lower-case substrings that mark an applicant name as an organisation
# (company, university, institute, ...). They are matched against the
# lower-cased applicant name, so every entry must itself be lower-case and
# every entry must be followed by a comma (adjacent string literals are
# silently concatenated by Python).
_APPLICANT_KEYWORDS = (
    'research', 'group', 'society', 'foundation', 'inc', 'compania', 'lab',
    'industries', 'societe', 'manufacturing', 'machine', 'co ',
    'corp', 'association', 'university', 'institute', 'company', 'llc',
    'ltd', 'lfp', 'industria', 'industrie', 'firm', '+', 'co.',
    'pharmaceuticals', 'roche', "l'oreal", 'campos', 'technologies', 'inst',
    'pharma', 'electronics', 'volvo', 'corporation',
    'ltda', 'communications', 'ifp', 'technik', 'siemens', 's.a',
    'operations', 'limited', 'gmbh', 'novartis', 'agency',
    'elektronik', 's.p.a', 'uniwersytet', 's.l', 's.r.l', 'a.s', 'urs',
    'ag ', 'universiteit', 'hospital', 'silverphase',
    'sanofi', 'science', 'medicament', 'recherche', 'tech', 'international',
    'networks', 'france', 'nucleix', 'cosmetique',
    'astrazeneca', 'universite', 'les ', 'igt', 'service', 'services',
    'univ', 'products', 'product', 'bank', 'compan',
    'cotton', '& co', '&co', 'comp', 'constructions', 'meca', 'sciences',
    'consulting', ' spa', 'management', 'associates',
    'holdings', 'systems', ' as', ' co', 'electric', 'printing', 'steel',
    ' ind', 'chemicals', ' ag', 'a.g', 'johnson & johnson',
    'gm. b. h', 'informazioni', 'g. m. b. h.', 'anonyme', 'limitada',
    'sociedad', 'solex', 's. a', 'eleuterio', 'societr',
    'commissariat', 'interlight', 's. l', 'electronique',
    'moebius & ruppert', 'g m b h', 'elektro', 'società', 'energía',
    'philips', '&', 's.c.i.', 'société', 'sté', 'g.m.b.h', 'energy',
    'a. k', 'investigación', 'fabrica',
)


def _is_organisation(applicant):
    """Return True if the lower-cased applicant name contains an organisation keyword."""
    name = applicant.lower()
    return any(keyword in name for keyword in _APPLICANT_KEYWORDS)


def determine_status(row):
    """Classify one patent row by who filed it.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'Inventor'`` and ``'Applicant'`` (for example
        a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        ``"Solo Inventor"``, ``"Research/Company"`` or ``"default"`` (the
        type could not be determined).

    Notes
    -----
    The organisation test is a plain substring heuristic, so short keywords
    can also match personal names.
    """
    inventor = row['Inventor']
    applicant = row['Applicant']
    has_inventor = pd.notna(inventor)

    # Applicant missing: an inventor on its own counts as a solo inventor.
    if pd.isna(applicant):
        return "Solo Inventor" if has_inventor else "default"

    # The CSV columns can hold mixed types; anything that is not text cannot
    # be classified, so report it explicitly instead of returning None or
    # failing on .lower().
    if not isinstance(applicant, str):
        return "default"

    if has_inventor:
        if not isinstance(inventor, str):
            return "default"
        # The applicant's name appearing among the inventors means the
        # inventor filed the patent personally.
        if applicant in inventor:
            return "Solo Inventor"
        if _is_organisation(applicant):
            return "Research/Company"
        return "default"

    # Inventor missing, applicant present.
    if _is_organisation(applicant):
        return "Research/Company"
    if applicant == 'applicant name missing':
        return "default"
    return "Solo Inventor"

Converting the publication date to a datetime, applying the function to extract the patent type, and counting patents by year, patent type and country

# Parse the publication date strings (day.month.year) into datetimes
publication_dates = pd.to_datetime(pat["Publication date"], format="%d.%m.%Y")
pat["Publication date"] = publication_dates
# Keep the publication year as its own column
pat["Year"] = publication_dates.dt.year
# Classify every patent row with determine_status
pat["Type"] = pat.apply(determine_status, axis=1)
# Count the patents for each year / type / country combination
group_keys = ["Year", "Type", "Full Country"]
pat = pat.groupby(group_keys).size().reset_index(name="Patents")

Getting proportions

# Drop patents whose type could not be determined. The explicit .copy() makes
# pat_proportions an independent frame, so adding a column to it below does
# not raise pandas' SettingWithCopyWarning.
pat_proportions = pat[pat["Type"] != "default"].copy()
# Total number of classified patents per year, aligned with each row
total_per_year = pat_proportions.groupby("Year")["Patents"].transform("sum")
# Share of each row in its year's total
pat_proportions["Proportion"] = pat_proportions["Patents"] / total_per_year
# Collapse the country dimension: sum only the numeric columns per year and
# type, so the "Full Country" text column is not aggregated.
pat_proportions = (
    pat_proportions.groupby(["Year", "Type"])[["Patents", "Proportion"]]
    .sum()
    .reset_index()
)

/tmp/ipykernel_22/858095603.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

3.1 Data Visualization: Stacked Bars

# Stacked bar chart: one bar per year, split by patent type
fig1 = px.bar(
    pat_proportions,
    x="Year",
    y="Proportion",
    color="Type",
    orientation="v",
    hover_name="Year",
    hover_data=["Patents", "Proportion"],
    color_discrete_sequence=["#F7C0BB", "#EB5E51"],
    template="plotly_white",
    title="<b>Worldwide Patent Applications Classification</b>",
    width=1100,
    height=400,
)

# Axes: a tick every five years on x, percentages on y
fig1.update_xaxes(
    title=None,
    dtick=5,
    ticks="outside",
    ticklen=4,
    range=[1910, 2023],
)
fig1.update_yaxes(title="Relative Proportion", tickformat=",.0%")

# Fonts and legend
layout_options = dict(
    legend_title=None,
    font_family="Calibri",
    font_color="black",
    title_font_family="Calibri",
    font=dict(size=14),
    title_font_color="black",
)
fig1.update_layout(**layout_options)

# Source credit, positioned in paper coordinates below the plot area
annotations = [
    dict(
        text="Source: WIPO Patentscope",
        showarrow=False,
        x=0.99,
        y=-0.23,
        xref="paper",
        yref="paper",
        font=dict(size=11, color="grey"),
    ),
]
for annotation in annotations:
    fig1.add_annotation(annotation)

fig1.show()

# Embed the page at janeabdo.github.io/carousel in an inline frame (800 x 700)
IFrame(src='https://janeabdo.github.io/carousel/', width='800', height='700')