ELRC, European Language Resource Coordination
Documents published on the European Parliament's official website
Available for Download ✅
⚠️ Always check the license of the data source before using the data ⚠️
- Main page: https://elrc-share.eu/
- Data Browse Link: https://elrc-share.eu/repository/search/
- Format: .tmx
# Load a single sample TMX file to inspect what the parsed data looks like:
# `tmx2dataframe.read` returns the file's metadata together with a DataFrame.
sample_tmx_path = 'elrc/citizens_information_en-ga.tmx'
metadata, df = tmx2dataframe.read(sample_tmx_path)
print(len(df))
df.head()
# For every .tmx file in the ELRC folder, keep the rows whose target language
# contains `lang` and save them as "<lower-cased file path>.csv".
lang = 'ga'
dir_path = Path('elrc')
samp_count = 0
for f in progress_bar(list(dir_path.iterdir())):
    if f.suffix != '.tmx':
        continue
    try:
        _, df = tmx2dataframe.read(str(f))
        # Compare language codes case-insensitively.
        df.target_language = df.target_language.str.lower()
        # Substring match on the language string (like 'ga'). na=False treats
        # rows with a missing target language as non-matching instead of
        # producing an invalid boolean mask that would discard the whole file.
        lang_rows = df[df.target_language.str.contains(lang, na=False)].copy()
    except Exception as err:
        # Best effort: report the file that could not be processed and move on.
        # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
        print(f"Couldn't open {f}: {err}")
        continue
    if lang_rows.empty:
        # Nothing for this language in this file; do not write an empty CSV.
        continue
    # The result is held in a variable assigned on every iteration, so a stale
    # frame left over from another cell or an earlier run can never be written.
    lang_rows['filepath'] = str(f)
    lang_rows.reset_index(inplace=True, drop=True)
    samp_count += len(lang_rows)
    lang_rows.to_csv(f'{str(f).lower()}.csv')
    # Release the extracted rows before parsing the next file.
    del lang_rows
    gc.collect()

print(f'{samp_count} total text samples extracted')
# Merge every per-file .csv in the ELRC folder into one compiled dataset.
lang = 'ga'
dir_path = Path('elrc')
# Sorted so the row order of the compiled file is the same on every run.
f_list = sorted(p for p in dir_path.iterdir() if p.suffix == '.csv')

# Read each file exactly once and concatenate a single time at the end. The
# previous version read the first file twice and concatenated it with itself,
# which duplicated its rows in the compiled output.
frames = []
for f in progress_bar(f_list):
    try:
        # index_col=0: the first CSV column is loaded as the index, not as data.
        frames.append(pd.read_csv(f, index_col=0))
    except Exception as err:
        # Best effort: skip the unreadable file but say why it was skipped.
        print(f'Error with opening {f}')
        print(err)

if not frames:
    # Fail with a clear message instead of an unrelated NameError further down.
    raise FileNotFoundError(f'No readable .csv files found in {dir_path}')

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
ga_df = pd.concat(frames, ignore_index=True)
print(len(ga_df))
ga_df.to_csv('elrc_en-ga_compiled_2020-06-11.csv', index=False)
ga_df.head()
Number of source documents:
Number of lines per source document: