Coverage for base-mlguessimporter\src\mlguessimporter\import_fns.py: 98%
376 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-18 23:30 -0700
« prev ^ index » next coverage.py v7.6.1, created at 2024-09-18 23:30 -0700
1import json
2import pandas as pd
3from typing import Optional
4import urllib.request
5import urllib.parse
6import certifi
7import ssl
9from ucimlrepo.dotdict import dotdict
10from ucimlrepo.fetch import DatasetNotFoundError
12import traceback
# API endpoints
# Metadata endpoint for a single dataset; fetch_ucirepo appends a
# `?name=` or `?id=` query string to it.
API_BASE_URL = 'https://archive.ics.uci.edu/api/dataset'
# Dataset listing endpoint (not referenced by the code visible in this file).
API_LIST_URL = 'https://archive.ics.uci.edu/api/datasets/list'

# base location of data csv files
DATASET_FILE_BASE_URL = 'https://archive.ics.uci.edu/static/public'

# available categories of datasets to filter by
VALID_FILTERS = ['aim-ahead']
24import re
def fetch_ucirepo(
        name: Optional[str] = None,
        id: Optional[int] = None
    ):
    '''
    Loads a dataset from the UCI ML Repository, including the dataframes and metadata information.

    Parameters:
        id (int): Dataset ID for UCI ML Repository
        name (str): Dataset name, or substring of name
        (Only provide id or name, not both)

    Returns:
        result (tuple): a `(kind, payload)` pair.
            ('uci', dotdict)    the repository metadata has a `data_url`; the payload
                                exposes `data`, `metadata` and `variables`.
            ('custom', payload) the metadata has no `data_url`; the payload is whatever
                                `special_case_download` or `get_data` produced.

    Raises:
        ValueError: both or neither of name/id were given, an argument has the wrong
            type, or a variable in the metadata has an unknown role.
        DatasetNotFoundError: the API reported a non-200 status, or the dataset has no
            data URL and no downloadable archive could be imported.
    '''

    # check that only one argument is provided
    if name and id:
        raise ValueError('Only specify either dataset name or ID, not both')

    # validate types of arguments and build the endpoint query string
    if name:
        if not isinstance(name, str):
            raise ValueError('Name must be a string')
        query = '?name=' + urllib.parse.quote(name)
    elif id:
        # exact type check on purpose: bool is an int subclass and must be rejected
        if type(id) is not int:
            raise ValueError('ID must be an integer')
        query = '?id=' + str(id)
    else:
        # no arguments provided
        raise ValueError('Must provide a dataset name or ID')

    api_url = API_BASE_URL + query

    # fetch metadata from API; the context manager closes the HTTP response
    # even if the JSON payload fails to parse
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    with urllib.request.urlopen(api_url, context=ssl_context) as response:
        data = json.load(response)

    # verify that dataset exists
    if data['status'] != 200:
        error_msg = data.get('message', 'Dataset not found in repository')
        raise DatasetNotFoundError(error_msg)

    # extract ID, name, and URL from metadata
    metadata = data['data']
    if not id:
        id = metadata['uci_id']
    elif not name:
        name = metadata['name']

    data_url = metadata['data_url']

    # no data URL means that the dataset cannot be imported into Python
    # i.e. it does not yet have a standardized CSV file for pandas to parse
    if not data_url:
        # datasets with hand-written import paths
        if id == 132 or name == 'movie':
            return ('custom', special_case_download('movie'))
        elif id == 34 or name == 'diabetes':
            return ('custom', special_case_download('diabetes'))
        elif id == 137 or name == 'reuters21578' or name == 'reuters+21578+text+categorization+collection':
            return ('custom', special_case_download('reuters'))
        elif id == 121 or name == 'eeg+database':
            return ('custom', special_case_download('eeg'))

        # generic fallback: scrape the dataset page and import its archive
        data = get_data(id, name)

        if data is None:
            raise DatasetNotFoundError('"{}" dataset (id={}) exists in the repository, but is not available for import. Please select a dataset from this list: https://archive.ics.uci.edu/datasets?skip=0&take=10&sort=desc&orderBy=NumHits&search=&Python=true'.format(name, id))

        return ('custom', data)

    # parse into dataframe using pandas
    df = pd.read_csv(data_url)

    # header line should be variable names
    headers = df.columns

    # feature information, class labels
    variables = metadata['variables']
    del metadata['variables']      # moved from metadata to a separate property

    # organize variables into IDs, features, or targets
    variables_by_role = {
        'ID': [],
        'Feature': [],
        'Target': [],
        'Other': []
    }
    for variable in variables:
        if variable['role'] not in variables_by_role:
            raise ValueError('Role must be one of "ID", "Feature", or "Target", or "Other"')
        variables_by_role[variable['role']].append(variable['name'])

    # extract dataframes for each variable role (None when a role has no variables)
    ids_df = df[variables_by_role['ID']] if variables_by_role['ID'] else None
    features_df = df[variables_by_role['Feature']] if variables_by_role['Feature'] else None
    targets_df = df[variables_by_role['Target']] if variables_by_role['Target'] else None

    # place all varieties of dataframes in data object
    data = {
        'ids': ids_df,
        'features': features_df,
        'targets': targets_df,
        'original': df,
        'headers': headers,
    }

    # convert variables from JSON structure to tabular structure for easier visualization
    variables = pd.DataFrame.from_records(variables)

    # make nested metadata fields accessible via dot notation
    metadata['additional_info'] = dotdict(metadata['additional_info']) if metadata['additional_info'] else None
    metadata['intro_paper'] = dotdict(metadata['intro_paper']) if metadata['intro_paper'] else None

    # construct result object
    result = {
        'data': dotdict(data),
        'metadata': dotdict(metadata),
        'variables': variables
    }

    # convert to dictionary with dot notation
    return ('uci', dotdict(result))
171import requests
172from bs4 import BeautifulSoup
173import os
174from urllib.parse import urljoin
175import warnings
def get_data(id, name, timeout=(10, 60)):
    """
    Scrape the dataset's page on archive.ics.uci.edu for its "Download" link,
    fetch the archive it points to and import its contents.

    Parameters:
        id: dataset ID, used to build the page URL
        name: dataset name, used to build the page URL
        timeout: `(connect, read)` timeout in seconds for the page request, so an
            unresponsive server cannot block the caller forever

    Returns:
        Whatever `extract_and_get_path` returns for the downloaded archive, or
        None when the page cannot be retrieved or contains no download link.
    """
    # URL of the website you want to scrape
    url = f'https://archive.ics.uci.edu/dataset/{id}/{name}'

    # Send a GET request to the website
    response = requests.get(url, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f'Failed to retrieve the webpage. Status code: {response.status_code}')
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep only anchors that wrap a <span> whose text is exactly "Download"
    download_links = [
        tag for tag in soup.find_all('a')
        if tag.find('span', string='Download')
    ]
    if not download_links:
        print('No download link found.')
        return None

    # Get the href attribute (the URL the link points to)
    download_url = download_links[0]['href']
    print('Download link found:', download_url)
    full_download_url = urljoin(url, download_url)
    print('Download link:', full_download_url)

    # the archive is stored in the current working directory under its remote name
    file_name = os.path.basename(full_download_url)
    file_path = os.path.join(os.getcwd(), file_name)

    clear_extract_folder()

    if os.path.exists(file_path):
        warnings.warn('Using locally downloaded archive, since downloading can be very slow...')
        return extract_and_get_path(file_path)

    # Send a GET request to the download URL
    download_file(full_download_url, file_path)

    print(f'File downloaded and saved as: {file_path}')

    return extract_and_get_path(file_path)
def clear_extract_folder():
    """
    Make sure `<cwd>/extracted_files` exists and is empty.

    The folder is created when missing. Otherwise every entry directly inside
    it is deleted: files are removed, directories are removed recursively.
    Deletion is best-effort: a failure on one entry is printed and the
    remaining entries are still processed.
    """
    import os
    import shutil
    import sys

    folder = os.path.join(os.getcwd(), 'extracted_files')

    if not os.path.exists(folder):
        os.makedirs(folder)
        return

    if not os.path.isdir(folder):
        # something that is not a directory occupies the name; nothing to clear
        return

    def make_writable_and_retry(func, path, _exc):
        # rmtree error hook: lift a read-only bit and retry the failed call once
        os.chmod(path, 0o777)
        func(path)

    # shutil.rmtree renamed its error hook from `onerror` to `onexc` in Python 3.12
    hook_name = 'onexc' if sys.version_info >= (3, 12) else 'onerror'

    # snapshot the listing first so entries are not deleted while being iterated
    with os.scandir(folder) as listing:
        entries = list(listing)

    for entry in entries:
        try:
            if entry.is_dir(follow_symlinks=False):
                os.chmod(entry.path, 0o777)  # Ensure the directory is writable
                shutil.rmtree(entry.path, **{hook_name: make_writable_and_retry})
            else:
                if not entry.is_symlink():
                    # chmod follows symlinks, so only touch real files
                    os.chmod(entry.path, 0o777)  # Ensure the file is writable
                os.remove(entry.path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (entry.path, e))
244import zipfile
245import tarfile
246import gzip
def extract_and_get_path(file_path):
    """
    Recursively extract nested archives until we get a file

    The archive at `file_path` is extracted into `<cwd>/extracted_files` and the
    extracted entries are tried in this order:

    1. `.data`, `.xlsx` and `.csv` files: each is loaded and the result is a
       dict mapping file basename to DataFrame.
    2. nested archives, most promising name first: the first one that yields a
       result wins; for the others `try_json_extraction` is attempted.
    3. `.txt` files, then files without an extension, parsed as delimited text
       with several candidate delimiters.

    Returns:
        a dict of DataFrames, a single DataFrame (a text file that parsed into
        more than one column without missing values), the list of
        `try_json_extraction` results, or None when nothing was found.
    """
    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    # Extract the archive
    name_list = extract_archive(file_path, extract_path)
    extracted_files = [os.path.join(extract_path, file) for file in name_list]

    archives = []
    text_files = []
    extensionless_files = []
    table_files = []

    # a file may land in more than one bucket; the checks are independent
    for file in extracted_files:
        if file.endswith('.data'):
            print(f'Returning {file}')
            table_files.append(file)
        if file.endswith('.xlsx'):
            print(f'Returning {file}')
            table_files.append(file)
        if file.endswith('.csv'):
            print(f'Returning {file}')
            table_files.append(file)
        if is_archive(file):
            archives.append(file)
        if '.txt' in file:
            text_files.append(file)
        # look at the file name only: a dot in a parent directory must not
        # disqualify an extensionless file
        if '.' not in os.path.basename(file):
            extensionless_files.append(file)

    if len(table_files) > 0:
        ret = {}

        for f in table_files:
            basename = os.path.basename(f)
            try:
                df = read_csv(f)
            except Exception:
                # not readable as delimited text; try it as a spreadsheet
                df = pd.read_excel(f, engine='openpyxl')

            ret[basename] = df

        return ret

    print('Did not find any tabular/CSV data. Trying archives:', archives)

    ret = None

    import editdistance

    def dist(item):
        # smaller means closer to 'data' or to the name of the enclosing archive
        return min(editdistance.eval(item, 'data'), editdistance.eval(item, file_path))

    # sort in place; a bare sorted() call would throw the ordering away
    archives.sort(key=dist)

    for a in archives:
        print('trying this archive:', a)
        res = extract_and_get_path(a)
        if res is not None:
            return res
        else:
            if ret is None:
                ret = []
            print('trying JSON')
            res = try_json_extraction(a)
            if res is not None:
                ret.append(res)

    # try any txt files as CSV file
    print('trying txt files')
    delimiters = [None, ',', ';', '\t']
    backup = {}   # best imperfect parses so far, keyed by file basename
    nasum = 1     # lowest missing-value ratio among the backups
    for t in text_files:
        for d in delimiters:
            try:
                df = read_csv(t, header=None, delimiter=d)

                this_nasum = df.isna().sum().sum() / df.size
                if df.shape[1] == 1 or this_nasum > 0:
                    # single column or missing values: keep only as a backup
                    if this_nasum < nasum:
                        backup = {os.path.basename(t): df}
                        nasum = this_nasum
                    elif abs(this_nasum - nasum) < 0.01:
                        backup[os.path.basename(t)] = df
                    continue

                return df
            except Exception:
                # this delimiter does not fit this file; try the next one
                pass

    print('trying files with no extension')
    for t in extensionless_files:
        for d in delimiters:
            try:
                df = read_csv(t, header=None, delimiter=d)

                this_nasum = df.isna().sum().sum() / df.size
                if df.shape[1] == 1 or this_nasum > 0:
                    # single column or missing values: keep only as a backup
                    if this_nasum < nasum:
                        backup = {os.path.basename(t): df}
                        nasum = this_nasum
                    continue

                return df
            except Exception:
                # this delimiter does not fit this file; try the next one
                pass

    if backup:
        print('returning backup')
        return backup

    return ret
def extract_archive(file_path, extract_to):
    """
    Extract various types of archive files.

    Parameters:
        file_path: path of the archive; the format is chosen from its suffix
            (`.zip`, `.tar`, `.tar.gz`, `.tgz`, `.gz`, `.tar.Z`)
        extract_to: directory that receives the extracted content

    Returns:
        list of extracted entries. For zip and tar archives these are the member
        names as stored in the archive; for a plain `.gz` it is the path of the
        single decompressed file.

    Raises:
        ValueError: the suffix is not one of the supported formats.
    """
    # NOTE(review): zip/tar members are extracted without path sanitising; these
    # archives are downloaded, so confirm that the sources are trusted.
    if file_path.endswith('.zip'):
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            return zip_ref.namelist()
    elif file_path.endswith(('.tar', '.tar.gz', '.tgz')):
        # context manager so the handle is released even when extraction fails
        with tarfile.open(file_path) as tar:
            tar.extractall(path=extract_to)#, filter='data')
            return [member.name for member in tar.getmembers()]
    elif file_path.endswith('.gz'):
        import shutil

        print('handle .gz files')
        # .gz files usually contain a single file, so extract that;
        # strip only the trailing '.gz', not every occurrence in the name
        inner_name = os.path.basename(file_path)[:-len('.gz')]
        output_file_path = os.path.join(extract_to, inner_name)
        # stream the payload instead of holding it all in memory
        with gzip.open(file_path, 'rb') as gz_ref, open(output_file_path, 'wb') as out_file:
            shutil.copyfileobj(gz_ref, out_file)
        os.chmod(output_file_path, 0o755)
        return [output_file_path]
    elif file_path.endswith('.tar.Z'):
        import unlzw3
        from pathlib import Path
        uncompressed_data = unlzw3.unlzw(Path(file_path).read_bytes())
        # drop the '.Z' suffix; the decompressed file is written next to the archive
        inner_file = file_path[:-2]
        with open(inner_file, 'wb') as file:
            file.write(uncompressed_data)

        if is_archive(inner_file):
            return extract_archive(inner_file, extract_to)
        else:
            return [inner_file]
    else:
        raise ValueError("Unsupported file format:", file_path)
def is_archive(file_path):
    """Return True when `file_path` ends, ignoring case, with a known archive suffix."""
    lowered = file_path.lower()
    for suffix in ('.zip', '.tar', '.gz', '.bz2', '.7z', '.rar', '.z', '.tar.z'):
        if lowered.endswith(suffix):
            return True
    return False
436def read_csv(file, header=None, delimiter=None):
437 largest_col_cnt = 0
439 with open(file, 'r') as temp_f:
440 lines = temp_f.readlines()
442 for l in lines:
443 col_cnt = len(l.split(delimiter)) + 1
445 largest_col_cnt = max(largest_col_cnt, col_cnt)
447 col_names = [i for i in range(0, largest_col_cnt)]
449 return pd.read_csv(file, header=None, delimiter=delimiter, names=col_names)
def try_json_extraction(archive, name_list=None):
    """
    Try to import a dataset that is a tree of plain files, where each file's
    text content becomes one value.

    Parameters:
        archive: archive to extract into `<cwd>/extracted_files`; only used when
            `name_list` is None
        name_list: entry names, relative to the extraction folder, of content
            that has already been extracted

    Returns:
        pandas.DataFrame when the entries are nested at most two levels below
        their common prefix: one column per parent folder, one cell per file.
        Otherwise a nested dict mirroring the directory tree, with file names
        mapped to file contents. Unreadable files contribute an empty string.
    """

    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    # Extract the archive
    if name_list is None:
        name_list = extract_archive(archive, extract_path)

    extracted_files = [os.path.join(extract_path, file) for file in name_list]

    print('Attempting CSV import...')

    from pathlib import Path

    # NOTE(review): os.path.commonprefix compares character by character, so the
    # prefix can end in the middle of a file name — confirm for flat archives.
    start_folder = os.path.commonprefix(extracted_files)
    extracted_files = [s[len(start_folder):] for s in extracted_files]

    max_depth = 0

    for e in extracted_files:
        depth = len(Path(e).parents)

        max_depth = max(max_depth, depth)

    def read(file):
        # best-effort read: directories, binary content and the like yield ""
        try:
            with open(file, 'r') as f:
                # Read the entire content of the file
                content = f.read()

            return content
        except Exception:
            # not a bare except: Ctrl-C and interpreter exit must still propagate
            return ""

    if max_depth <= 2:
        data = dict()

        for f in extracted_files:
            if f == '':
                # the entry that equals the common prefix itself
                continue
            # drop the first character, presumably the leading path separator
            f = f[1:]

            parts = os.path.split(f)

            if parts[0] not in data:
                data[parts[0]] = []

            data[parts[0]].append(read(os.path.join(start_folder, f)))

        # columns may differ in length; building by index and transposing pads them
        df = pd.DataFrame.from_dict(data, orient='index')
        df = df.transpose()

        return df

    ret = {}
    # traverse root directory, and list directories as dirs and files as files
    print('Fallback: returning JSON dir structure')
    start_idx = len(splitpath(start_folder))
    for root, dirs, files in os.walk(start_folder):
        # path components from the start folder's own name down to `root`
        path = splitpath(root)[start_idx-1:]

        curr = {}

        for file in files:
            curr[file] = read(os.path.join(root, file))

        # descend to the parent node; os.walk visits parents before children
        currnode = ret
        for part in path[:-1]:
            currnode = currnode[part]
        currnode[path[-1]] = curr

    return ret
def splitpath(path):
    """Split `path` on runs of `/` or `\\` and return the non-empty components."""
    return [piece for piece in re.split(r'[/\\]+', path) if piece]
def read_gz_as_csv(gz_path):
    """
    Load a gzip-compressed, space-separated text file into a DataFrame.

    The file is opened in text mode; `#` starts a comment for the parser.
    """
    with gzip.open(gz_path, mode='rt') as handle:
        return pd.read_csv(handle, comment='#', sep=' ')
def _load_eeg_dataset():
    """Download the EEG archive if it is not in the cwd, unpack it and read every inner .gz file."""
    # For this dataset we use fully custom code
    clear_extract_folder()

    file_path = os.path.join(os.getcwd(), 'eeg+database.zip')
    if not os.path.exists(file_path):
        download_file('https://archive.ics.uci.edu/static/public/121/eeg+database.zip', file_path)

    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    print('extracting...')
    extract_archive(file_path, extract_path)

    eeg_full_files = os.path.join(extract_path, 'eeg_full')

    ret = dict()

    # every entry of eeg_full is itself an archive holding .gz files
    for file in os.listdir(eeg_full_files):
        print('processing:', file)
        name_list = extract_archive(os.path.join(eeg_full_files, file), extract_path)
        extracted_files = [os.path.join(extract_path, member) for member in name_list]

        per_archive = ret[os.path.basename(file)] = dict()
        for f in extracted_files:
            if '.gz' not in f:
                continue
            try:
                per_archive[os.path.basename(f)] = read_gz_as_csv(f)
            except Exception:
                # best-effort: files that cannot be parsed are left out
                continue

    return ret


def special_case_download(name):
    """
    Import one of the datasets that need a hand-written download path.

    Parameters:
        name (str): one of 'movie', 'diabetes', 'reuters' or 'eeg'

    Returns:
        For 'movie', 'diabetes' and 'reuters': dict mapping file name to
        DataFrame. A file of that name in the current working directory is
        preferred over the remote copy.
        For 'eeg': dict mapping each archive found in `eeg_full` to a dict of
        {inner .gz file name: DataFrame}.

    Raises:
        ValueError: `name` is not one of the supported datasets.
    """
    if name == 'eeg':
        return _load_eeg_dataset()

    if name == 'movie':
        base_url = 'https://raw.githubusercontent.com/cernoch/movies/master/data/'
        files = [
            'actors.csv',
            'casts.csv',
            'remakes.csv',
            'studios.csv',
            'synonyms.csv',
            'main.csv',
            'people.csv'
        ]
        delim = None
    elif name == 'diabetes':
        base_url = 'https://raw.githubusercontent.com/kenneth-ge/UCI-Import/main/manually_cleaned/diabetes'
        files = [
            'diabetes_complete.tsv'
        ]
        delim = '\t'
    elif name == 'reuters':
        base_url = 'https://raw.githubusercontent.com/kenneth-ge/UCI-Import/main/manually_cleaned/reuters'
        files = [
            'reuters_hayes_test.csv',
            'reuters_hayes_train.csv',
            'reuters_lewis_test.csv',
            'reuters_lewis_train.csv',
            'reuters_apte_test.csv',
            'reuters_apte_train.csv',
        ]
        delim = None
    else:
        # fail with a clear message instead of an unbound-local error further down
        raise ValueError(f'Unknown special-case dataset: {name!r}')

    ret = {}

    for f in files:
        if os.path.exists(f):
            # a local copy in the working directory takes precedence
            ret[f] = pd.read_csv(f, on_bad_lines='skip', delimiter=delim)
        else:
            # URLs always use forward slashes, whatever the host OS
            path = base_url.rstrip('/') + '/' + f
            print('path:', path)
            ret[f] = pd.read_csv(path, on_bad_lines='skip', delimiter=delim)

    return ret
624import requests
625from requests.exceptions import Timeout
626import time
def download_file(url, file_path, connect_timeout=10, read_timeout=99999):
    """
    Stream `url` to `file_path`.

    Parameters:
        url: address to download
        file_path: destination file; removed again if the download fails
        connect_timeout: seconds allowed for establishing the connection
        read_timeout: passed to requests as the read timeout, and also used as
            the limit for the total duration of the download

    Raises:
        Exception: any failure (HTTP error, timeout, I/O error); the original
            error is attached as `__cause__`.
    """
    try:
        # monotonic clock: elapsed time must not jump when the system clock changes
        start_time = time.monotonic()
        # Use a session to handle the request
        with requests.Session() as session:
            with session.get(url, stream=True, timeout=(connect_timeout, read_timeout)) as response:
                response.raise_for_status()  # Check for HTTP errors

                # Write the content to a file
                with open(file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        if time.monotonic() - start_time > read_timeout:
                            raise Timeout()
                        if chunk:  # filter out keep-alive new chunks
                            file.write(chunk)

    except Exception as err:
        print(err)
        # drop the partial download; the file may not exist if the failure
        # happened before it was opened, and that must not mask the real error
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass
        raise Exception("Download timed out or " + f"another error occurred: {err}") from err