AWB (Anwendungsbeobachtungen, Observational Studies)¶
This notebook reads in and cleans the received AWB data and shows some basic analysis.
from __future__ import division
import glob
from datetime import datetime, date, timedelta
import os
import itertools
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame, Series
import pandas as pd
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['svg.fonttype'] = 'none'
pd.options.display.max_rows = 150
filenames = [('data/AWB KBV Meldungen und Abschlüsse 2004 - 2011.xlsx', range(2009, 2012)),
('data/AWB KBV Meldungen und Abschlüsse 2012 - 2014.xlsx', (2012, 2013, 2014))]
def read_excel(filename, years, kind='update', needle='Meldungen'):
xl_file = pd.ExcelFile(filename)
sheet_names = xl_file.sheet_names
for year in years:
sheet_name = [x for x in sheet_names if str(year) in x and needle in x][0]
print(filename, sheet_name)
df = xl_file.parse(sheet_name)
# Make index into row number column
df = df.reset_index()
df['year'] = year
df['row_type'] = kind
df = df.rename(columns=dict([(x, x.strip()) for x in df.columns if x.strip() != x]))
# Consolidate column names
df = df.rename(columns={u'Präparat': u'Präparatname',
u'Präparat/Titel der Anwendung': u'Präparatname',
u'gemeldet am': 'DatumErstanzeige',
u'Datum Erstanzeige': 'DatumErstanzeige',
u'Beobachtugsplan vorliegend': 'Beobachtungsplan vorliegend',
u'Ärzte gemeldet': u'gemeldete Ärzte',
u'Anzahl teilnehmende Ärzte (wenn angegeben)': u'gemeldete Ärzte',
u'Anzahl der beobachtenden Ärzte': u'beobachtende Ärzte',
'index': 'row_number'
})
if 'DatumErstanzeige' not in df.columns:
df = df.rename(columns={'Eingang': 'DatumErstanzeige'})
# Fix date columns
date_cols = list(df.columns[df.columns.str.startswith('Datum')])
for x in date_cols:
df['dt_%s' % x] = pd.to_datetime(df[x], errors='coerce')
df[u'Präparatname'] = df[u'Präparatname'].str.strip()
# Remove entries with empty drug name
df = df[df[u'Präparatname'].notnull()]
yield df
First, read in all available update messages.
num_cols = ['Patienten geplant', 'Patienten beobachtet', u'gemeldete Ärzte', u'beobachtende Ärzte',
u'Vertragsärzte', u'Aufwandsentschädigung pro Patient']
float_cols = [u'Aufwandsentschädigung pro Patient']
df_updates = pd.concat(itertools.chain(*[read_excel(*args) for args in filenames]))
# To be compatible with Abschluesse column
df_updates['Aufwandsentschädigung gesamt in €'] = None
print('Number of rows', len(df_updates))
df_updates.head()
# Catch bogus row that is way down and contains a different header
# Interesting colum "HonorarPlausibilität" (plausibility of fee) which is not available in our dataset
bad_series = df_updates[df_updates['Art der NIS'] == 'Art der NIS'].T.iloc[:,0]
# Remove the row before processing further
df_updates = df_updates[~(df_updates['Art der NIS'] == 'Art der NIS')]
bad_series
Read in all final notices.
abschluesse_df = pd.concat(itertools.chain(*[read_excel(*args, needle='Abschl', kind='final') for args in filenames]))
abschluesse_df
# Make columns compatible with update notices
columns = u'dt_DatumErstanzeige dt_DatumStart dt_DatumEingang Präparatname Wirkstoff Firma Patienten beobachtet Patienten geplant beobachtende Ärzte gemeldete Ärzte Aufwandsentschädigung pro Patient Aufwandsentschädigung Kommentar Aufwandsentschädigung gesamt in € Art der NIS Auftraggeber Beobachtungsplan vorliegend BeobachtungszeitraumKommentar Brief/Mail/Fax DatumAbmeldung DatumBrief DatumEingang DatumEnde Kommentar MeldungsGrund MeldungsKommentar Meldungsart Meldungsinhalt Titel (Ziel) Typ Vertrag vorliegend Vertragsärzte dt_DatumAbmeldung dt_DatumBrief dt_DatumEnde year'.split('\t')
for c in columns:
if c not in abschluesse_df:
abschluesse_df[c] = None
print('Number of rows', len(abschluesse_df))
abschluesse_df.head()
# Same as before, remove bogus header row at line 12607 of final notices in year 2013
bad_series = abschluesse_df[abschluesse_df['Art der NIS'] == 'Art der NIS'].T.iloc[:,0]
# Remove the row before processing further
abschluesse_df = abschluesse_df[~(abschluesse_df['Art der NIS'] == 'Art der NIS')]
bad_series
Get cleaner number representation of total amount.
import re
import numbers
NUMBER_RE = re.compile('^\s*([\d\., ]+)')
NUMBERS_RE = {
re.compile(r'^([\d\.]+),(\d{1,2}]+)'): '.',
re.compile(r'^([\d,]+)\.(\d{1,2}]+)'): ',',
}
def clean_money(x):
if isinstance(x, numbers.Number):
return x
x = NUMBER_RE.sub('\\1', x)
for reg, repl in NUMBERS_RE.items():
m = reg.search(x)
if m is None:
continue
before = int(m.group(1).replace(repl, ''))
after = int(m.group(2))
if after < 10:
after = after / 10.0
else:
after = after / 100.0
return before + after
return None
abschluesse_df['Aufwandsentschädigung gesamt'] = abschluesse_df['Aufwandsentschädigung gesamt in €'].apply(clean_money)
abschluesse_df[['Aufwandsentschädigung gesamt', 'Aufwandsentschädigung gesamt in €']].head()
grouper = ['dt_DatumErstanzeige', 'dt_DatumStart']
# Fill missing values in grouping columns with dummy value,
# so it's not silently dropped by pandas groupby
dummy_date = pd.to_datetime(date(1900, 1, 1))
abschluesse_df[grouper] = abschluesse_df[grouper].fillna(dummy_date)
df_updates[grouper] = df_updates[grouper].fillna(dummy_date)
assert not abschluesse_df[grouper].isnull().any().any()
assert not df_updates[grouper].isnull().any().any()
Combine update notices and final notices.
df_all = pd.concat([df_updates, abschluesse_df])
df_all = df_all.reset_index(drop=True)
df_all['row_type'].value_counts()
# Add simpler version of präparatname that might group better later
DRUG_NAME_SPLITTER = re.compile(r'[^\w ]|\d|_', re.U | re.I)
def clean_praeparat(praeparat):
name = DRUG_NAME_SPLITTER.split(praeparat)[0].strip().lower()
if len(name) < 4:
return praeparat
return name
df_all['praeparat'] = df_all[u'Präparatname'].apply(clean_praeparat)
print('Original Präparatname Number of Groups', len(df_all[u'Präparatname'].value_counts()))
print('Cleaned Präparatname Number of Groups', len(df_all['praeparat'].value_counts()))
for name in num_cols:
new_name = 'num_%s' % name
df_all[new_name] = df_all[name].copy()
df_all[new_name] = df_all[new_name].apply(str)
if name not in float_cols:
df_all[new_name] = (df_all[new_name]
.str.replace('(geplante Anzahl *:?|ca\.|max\.|geplant *:)', '', flags=re.I)
.str.strip()
.str.replace(r'[ ,\.]', '')
)
df_all[new_name] = (df_all[new_name]
.str.replace('^\d+-(\d+)$', '\\1')
)
if name not in float_cols:
df_all[new_name] = (df_all[new_name]
.str.replace(r'^(\d+).*', '\\1', flags=re.I)
)
df_all[new_name] = pd.to_numeric(df_all[new_name], errors='coerce')
# Check if cleaning kind of worked
df_all[df_all['Patienten geplant'].str.contains(' ').fillna(False)][['Patienten geplant', 'num_Patienten geplant']].sample(10)
Analysis¶
Here's some exploratory analysis around the dataset.
First step is to group the single update and final notices into observational studies (AWB). We define the identification of one AWB to be the combination of its drug name, its registration date and its start date.
Per group of notices we find maxmimal numeric values for certain key figures and take the most prominent or last value for other columns.
def get_best_value(series):
vc = series.value_counts()
if len(vc) == 0:
lvi = series.last_valid_index()
if lvi is None:
return None
return series[lvi]
return vc.idxmax()
def get_awbs(groups):
for key, rows in groups:
# Use maximum number across columns and rows for one AWB
patient_count = rows[['num_Patienten beobachtet', 'num_Patienten geplant']].max().max()
doc_count = rows[['num_beobachtende Ärzte', 'num_gemeldete Ärzte']].max().max()
fee_per_patient = rows[[u'num_Aufwandsentschädigung pro Patient']].max().max()
yield pd.DataFrame([{
'praeparat': key[0],
'Präparatname': get_best_value(rows['Präparatname']),
'dt_DatumErstanzeige': key[1],
'dt_Start': key[2],
'patient_count': patient_count,
'doc_count': doc_count,
'fee_per_patient': fee_per_patient,
'calculated_total_fee': fee_per_patient * patient_count,
'fee_comment': get_best_value(rows['Aufwandsentschädigung Kommentar']),
'final_total_fee': rows['Aufwandsentschädigung gesamt'].max(),
# Use most used values across AWB rows
'Auftraggeber': get_best_value(rows['Auftraggeber']),
'Firma': get_best_value(rows['Firma']),
'Wirkstoff': get_best_value(rows['Wirkstoff']),
'dt_DatumEnde': get_best_value(rows['dt_DatumEnde']),
}])
awb_grouper = ['praeparat', 'dt_DatumErstanzeige', 'dt_DatumStart']
groups = df_all.sort_values(['dt_DatumEingang']).groupby(awb_grouper)
df_awb = pd.concat(get_awbs(groups))
df_awb = df_awb.reset_index(drop=True)
df_awb.head()
Number of extracted AWBs
len(df_awb)
Numbers of patients¶
df_awb['patient_count'].sum()
df_awb['patient_count'].describe()
Numbers of doctors¶
df_awb['doc_count'].sum()
df_awb['doc_count'].describe()
Fee per Patient¶
df_awb['fee_per_patient'].describe()
df_awb['final_total_fee'].describe()
How many AWBs over the years?¶
awbs_per_year = df_awb.groupby(df_awb.dt_DatumErstanzeige.dt.year).size()
# Drop stupid values
awbs_per_year = awbs_per_year.drop([1900, 1905])
awbs_per_year
awbs_per_year.plot()
How many patients were in the studies over the years?¶
patients_per_year = df_awb.groupby(df_awb.dt_Start.dt.year)['patient_count'].sum()
patients_per_year
patients_per_year.plot(kind='bar')
How many doctors participated over the years?¶
doctors_per_year = df_awb.groupby(df_awb.dt_Start.dt.year)['doc_count'].sum()
doctors_per_year
doctors_per_year.plot(kind='bar')
awbv = ['Auftraggeber', 'Firma', 'Präparatname', 'doc_count', 'patient_count', 'final_total_fee', 'fee_per_patient',
'fee_comment', 'calculated_total_fee', 'praeparat', 'Wirkstoff', 'dt_DatumEnde', 'dt_DatumErstanzeige',
'dt_Start']
Highest patient count¶
df_awb.sort_values('patient_count', ascending=False).head(10)[awbv]
Highest fee per patient¶
df_awb.sort_values('fee_per_patient', ascending=False).head(10)[awbv]
Highest calculated total fee (fee per patient times number of patients)¶
df_awb.sort_values('calculated_total_fee', ascending=False).head(10)[awbv]
Highest total final costs per AWB¶
df_awb.sort_values('final_total_fee', ascending=False).head(10)[awbv]
MabThera AWB¶
v = ['dt_DatumEingang', 'Präparatname', 'Auftraggeber', 'Patienten beobachtet', 'Patienten geplant',
'Aufwandsentschädigung pro Patient', 'Aufwandsentschädigung gesamt in €', 'row_type', 'year', 'row_number']
df_all[(df_all['Präparatname'] == 'MabThera') & (df_all['dt_DatumErstanzeige'].dt.year == 2009) & (df_all['dt_DatumErstanzeige'].dt.month == 5)][v]
df_awb[df_awb['Präparatname'] == 'MabThera'][awbv]
Difference between calculated total fee and final total fee¶
Top 10 where final total is higher than calculated total costs.
df_awb['final_calculated_diff'] = df_awb['calculated_total_fee'] - df_awb['final_total_fee']
df_awb.sort_values('final_calculated_diff').head(10)[['final_calculated_diff'] + awbv]
Top 10 where final total is lower than calculated total costs.
df_awb.sort_values('final_calculated_diff', ascending=False).head(10)[['final_calculated_diff'] + awbv]
Rough Top 10 final total fee by Firma
df_awb.groupby('Firma')['final_total_fee'].sum().sort_values(ascending=False).head(10)
Analysis of fee comments¶
"Patient independent payments" and other interesting bits can be found.
pd.set_option('max_colwidth', 120)
v = ['Präparatname', 'Auftraggeber', 'Patienten geplant', 'Aufwandsentschädigung pro Patient', 'Aufwandsentschädigung gesamt in €']
abschluesse_df[abschluesse_df['Aufwandsentschädigung gesamt in €'].str.contains('unabh').fillna(False)][v]
df_all[df_all['Typ'] == 'Nahrungsergänzungsmittel'][v]