Coverage for base-mlguessimporter\src\mlguessimporter\import_fns.py: 98%

376 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-09-18 23:30 -0700

1import json 

2import pandas as pd 

3from typing import Optional 

4import urllib.request 

5import urllib.parse 

6import certifi 

7import ssl 

8 

9from ucimlrepo.dotdict import dotdict 

10from ucimlrepo.fetch import DatasetNotFoundError 

11 

12import traceback 

13 

# API endpoints
# API_BASE_URL is the metadata endpoint that fetch_ucirepo queries with a
# '?name=' or '?id=' query string appended.
API_BASE_URL = 'https://archive.ics.uci.edu/api/dataset'
# NOTE(review): API_LIST_URL is not referenced in the visible part of this
# module; presumably kept for parity with ucimlrepo - confirm before removing.
API_LIST_URL = 'https://archive.ics.uci.edu/api/datasets/list'

# base location of data csv files
# NOTE(review): not referenced in the visible part of this module.
DATASET_FILE_BASE_URL = 'https://archive.ics.uci.edu/static/public'

# available categories of datasets to filter by
# NOTE(review): not referenced in the visible part of this module.
VALID_FILTERS = ['aim-ahead']

23 

24import re 

25 

def fetch_ucirepo(
        name: Optional[str] = None,
        id: Optional[int] = None
    ):
    '''
    Load a dataset from the UCI ML Repository, including dataframes and metadata.

    Parameters:
        id (int): Dataset ID for UCI ML Repository
        name (str): Dataset name, or substring of name
        (Only provide id or name, not both)

    Returns:
        tuple: a ``(source, payload)`` pair.
            * ``('uci', result)`` when the repository metadata has a data URL.
              ``result`` is a dotdict with ``data`` (ids / features / targets /
              original / headers), ``metadata`` and ``variables``.
            * ``('custom', payload)`` when there is no data URL. ``payload`` is
              whatever ``special_case_download`` or ``get_data`` returned.

    Raises:
        ValueError: if both or neither of name/id are given, if they have the
            wrong type, or if a variable has an unknown role.
        DatasetNotFoundError: if the API reports a non-200 status, or the
            dataset exists but no importable data could be obtained.
    '''

    # check that only one argument is provided
    if name and id:
        raise ValueError('Only specify either dataset name or ID, not both')

    # validate types of arguments and build the endpoint query string
    if name:
        if type(name) is not str:
            raise ValueError('Name must be a string')
        query = '?name=' + urllib.parse.quote(name)
    elif id:
        # exact type comparison on purpose: isinstance() would accept bool
        if type(id) is not int:
            raise ValueError('ID must be an integer')
        query = '?id=' + str(id)
    else:
        # no arguments provided
        raise ValueError('Must provide a dataset name or ID')

    api_url = API_BASE_URL + query

    # fetch metadata from API; the context manager closes the HTTP response
    # even if JSON decoding fails
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    with urllib.request.urlopen(api_url, context=ssl_context) as response:
        data = json.load(response)

    # verify that dataset exists
    if data['status'] != 200:
        error_msg = data['message'] if 'message' in data else 'Dataset not found in repository'
        raise DatasetNotFoundError(error_msg)

    # extract ID, name, and URL from metadata
    metadata = data['data']
    if not id:
        id = metadata['uci_id']
    elif not name:
        name = metadata['name']

    data_url = metadata['data_url']

    # no data URL means that the dataset cannot be imported into Python
    # i.e. it does not yet have a standardized CSV file for pandas to parse
    if not data_url:
        # (dataset id, accepted names, key understood by special_case_download);
        # checked in order, first match wins
        special_cases = (
            (132, ('movie',), 'movie'),
            (34, ('diabetes',), 'diabetes'),
            (137, ('reuters21578', 'reuters+21578+text+categorization+collection'), 'reuters'),
            (121, ('eeg+database',), 'eeg'),
        )
        for case_id, case_names, case_key in special_cases:
            if id == case_id or name in case_names:
                return ('custom', special_case_download(case_key))

        # generic fallback: scrape the dataset page and unpack its archive
        data = get_data(id, name)

        if data is None:
            raise DatasetNotFoundError('"{}" dataset (id={}) exists in the repository, but is not available for import. Please select a dataset from this list: https://archive.ics.uci.edu/datasets?skip=0&take=10&sort=desc&orderBy=NumHits&search=&Python=true'.format(name, id))

        return ('custom', data)

    # parse into dataframe using pandas
    df = pd.read_csv(data_url)

    # header line should be variable names
    headers = df.columns

    # feature information, class labels
    variables = metadata['variables']
    del metadata['variables']      # moved from metadata to a separate property

    # organize variables into IDs, features, or targets
    variables_by_role = {
        'ID': [],
        'Feature': [],
        'Target': [],
        'Other': []
    }
    for variable in variables:
        if variable['role'] not in variables_by_role:
            raise ValueError('Role must be one of "ID", "Feature", or "Target", or "Other"')
        variables_by_role[variable['role']].append(variable['name'])

    # extract dataframes for each variable role (None when the role is unused);
    # 'Other' variables are only reachable through the original dataframe
    ids_df = df[variables_by_role['ID']] if variables_by_role['ID'] else None
    features_df = df[variables_by_role['Feature']] if variables_by_role['Feature'] else None
    targets_df = df[variables_by_role['Target']] if variables_by_role['Target'] else None

    # place all varieties of dataframes in data object
    data = {
        'ids': ids_df,
        'features': features_df,
        'targets': targets_df,
        'original': df,
        'headers': headers,
    }

    # convert variables from JSON structure to tabular structure for easier visualization
    variables = pd.DataFrame.from_records(variables)

    # make nested metadata fields accessible via dot notation
    metadata['additional_info'] = dotdict(metadata['additional_info']) if metadata['additional_info'] else None
    metadata['intro_paper'] = dotdict(metadata['intro_paper']) if metadata['intro_paper'] else None

    # construct result object
    result = {
        'data': dotdict(data),
        'metadata': dotdict(metadata),
        'variables': variables
    }

    # convert to dictionary with dot notation
    return ('uci', dotdict(result))

170 

171import requests 

172from bs4 import BeautifulSoup 

173import os 

174from urllib.parse import urljoin 

175import warnings 

176 

def get_data(id, name):
    """
    Scrape the dataset page for its download link, fetch the archive and unpack it.

    Parameters:
        id: dataset id, interpolated into the dataset page URL
        name: dataset name, interpolated into the dataset page URL

    Returns:
        Whatever extract_and_get_path returns for the archive, or None when the
        page could not be retrieved or contains no download link.
    """
    page_url = f'https://archive.ics.uci.edu/dataset/{id}/{name}'
    page = requests.get(page_url)

    if page.status_code != 200:
        print(f'Failed to retrieve the webpage. Status code: {page.status_code}')
        return None

    # keep only anchors that wrap a <span> whose text is exactly "Download"
    soup = BeautifulSoup(page.content, 'html.parser')
    anchors = [tag for tag in soup.find_all('a') if tag.find('span', string='Download')]

    if not anchors:
        print('No download link found.')
        return None

    href = anchors[0]['href']
    print('Download link found:', href)
    # the href may be relative, so resolve it against the page URL
    archive_url = urljoin(page_url, href)
    print('Download link:', archive_url)

    # the archive is stored in the current working directory under its own name
    local_archive = os.path.join(os.getcwd(), os.path.basename(archive_url))

    clear_extract_folder()

    if os.path.exists(local_archive):
        # reuse an archive left over from a previous run
        warnings.warn('Using locally downloaded archive, since downloading can be very slow...')
    else:
        download_file(archive_url, local_archive)
        print(f'File downloaded and saved as: {local_archive}')

    return extract_and_get_path(local_archive)

220 

def clear_extract_folder():
    """
    Ensure ``<cwd>/extracted_files`` exists and is empty.

    Creates the folder when it is missing; otherwise removes its contents.
    Deletion is best effort: each failure is printed with the path that
    actually failed, and the remaining entries are still processed.
    """
    import os, shutil

    folder = os.path.join(os.getcwd(), 'extracted_files')

    if not os.path.exists(folder):
        os.makedirs(folder)
        return

    def discard(path, remover):
        # Handle each entry on its own so one failure neither aborts its
        # siblings nor reports an unrelated (or unbound) path.
        try:
            if os.path.islink(path):
                # drop the link itself; never chmod or descend into its target
                os.unlink(path)
            else:
                os.chmod(path, 0o777)     # ensure the entry is writable
                remover(path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (path, e))

    for root, dir_names, file_names in os.walk(folder):
        for file_name in file_names:
            discard(os.path.join(root, file_name), os.remove)

        for dir_name in dir_names:
            discard(os.path.join(root, dir_name), shutil.rmtree)

        # Only keep walking into directories that survived, so their files
        # still get a chmod + remove attempt one by one.
        dir_names[:] = [d for d in dir_names if os.path.isdir(os.path.join(root, d))]

243 

244import zipfile 

245import tarfile 

246import gzip 

247 

def extract_and_get_path(file_path):
    """
    Recursively extract nested archives until tabular data is found.

    The archive is unpacked into ``<cwd>/extracted_files`` and its members are
    tried in this order:

    1. ``.data`` / ``.xlsx`` / ``.csv`` members: all are parsed and returned.
    2. Nested archives, closest name match first: each is handled recursively.
       If that yields nothing, its content is collected via
       ``try_json_extraction``.
    3. ``.txt`` members, then members without an extension, each tried with
       several delimiters.

    Parameters:
        file_path: path of the archive to unpack

    Returns:
        One of (the shape depends on which step succeeded):
        * dict mapping file basename -> DataFrame (step 1, or the "backup"
          candidates from step 3),
        * a single DataFrame (first text file that parses into more than one
          column with no missing values),
        * a list of ``try_json_extraction`` results (step 2 fallback),
        * None when nothing at all could be imported.
    """
    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    # Extract the archive
    name_list = extract_archive(file_path, extract_path)
    extracted_files = [os.path.join(extract_path, member) for member in name_list]

    archives = []
    text_files = []
    extensionless_files = []
    table_files = []

    for file in extracted_files:
        if file.endswith(('.data', '.xlsx', '.csv')):
            print(f'Returning {file}')
            table_files.append(file)
        if is_archive(file):
            archives.append(file)
        if '.txt' in file:
            text_files.append(file)
        # Look at the file name only: the full path includes the working
        # directory, and any dot in it would hide every extensionless file.
        if '.' not in os.path.basename(file):
            extensionless_files.append(file)

    if table_files:
        tables = {}

        for table_file in table_files:
            try:
                df = read_csv(table_file)
            except Exception:
                # not readable as delimited text; fall back to an Excel workbook
                df = pd.read_excel(table_file, engine='openpyxl')

            tables[os.path.basename(table_file)] = df

        return tables

    print('Did not find any tabular/CSV data. Trying archives:', archives)

    json_results = None

    import editdistance

    def dist(item):
        # smaller is better: close to the literal 'data' or to the parent archive name
        return min(editdistance.eval(item, 'data'), editdistance.eval(item, file_path))

    # Sort in place; the original called sorted() and discarded the result,
    # so the intended ordering was never applied.
    archives.sort(key=dist)

    for a in archives:
        print('trying this archive:', a)
        res = extract_and_get_path(a)
        if res is not None:
            return res

        if json_results is None:
            json_results = []
        print('trying JSON')
        res = try_json_extraction(a)
        if res is not None:
            json_results.append(res)

    delimiters = [None, ',', ';', '\t']
    # Imperfect candidates (single column, or containing NaNs). Only the ones
    # with the lowest NaN ratio seen so far are kept.
    backup = {}
    nasum = 1

    def first_clean_frame(paths, keep_ties):
        """Return the first cleanly parsed frame from *paths*, else None.

        Imperfect parses are recorded in ``backup``. With *keep_ties*, frames
        whose NaN ratio is within 0.01 of the best are kept alongside it.
        """
        nonlocal backup, nasum
        for path in paths:
            for delimiter in delimiters:
                try:
                    df = read_csv(path, header=None, delimiter=delimiter)
                    this_nasum = df.isna().sum().sum() / df.size
                except Exception:
                    # unreadable with this delimiter; try the next one
                    continue

                if df.shape[1] != 1 and not (this_nasum > 0):
                    return df

                if this_nasum < nasum:
                    backup = {os.path.basename(path): df}
                    nasum = this_nasum
                elif keep_ties and abs(this_nasum - nasum) < 0.01:
                    backup[os.path.basename(path)] = df
        return None

    # try any txt files as CSV file
    print('trying txt files')
    df = first_clean_frame(text_files, keep_ties=True)
    if df is not None:
        return df

    print('trying files with no extension')
    df = first_clean_frame(extensionless_files, keep_ties=False)
    if df is not None:
        return df

    if backup:
        print('returning backup')
        return backup

    return json_results

392 

def extract_archive(file_path, extract_to):
    """
    Extract various types of archive files.

    Parameters:
        file_path: path of the archive; the format is chosen from its suffix
            (.zip, .tar, .tar.gz, .tgz, .gz, .tar.Z)
        extract_to: directory the content is written to (the .tar.Z branch
            writes its intermediate .tar next to the archive instead)

    Returns:
        list of str: for zip/tar archives the member names as stored in the
        archive (relative to extract_to); for .gz the full path of the single
        decompressed file; for .tar.Z the result of extracting the inner tar.

    Raises:
        ValueError: if the suffix is not one of the supported formats.
    """
    if file_path.endswith('.zip'):
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            return zip_ref.namelist()
    elif file_path.endswith(('.tar', '.tar.gz', '.tgz')):
        # context manager closes the archive even if extraction fails
        with tarfile.open(file_path) as tar:
            # NOTE(review): extraction runs without an extraction filter, so
            # member paths from untrusted archives are not sanitised here.
            tar.extractall(path=extract_to)
            return [member.name for member in tar.getmembers()]
    elif file_path.endswith('.gz'):
        print('handle .gz files')
        # .gz files usually contain a single file, so extract that
        with gzip.open(file_path, 'rb') as gz_ref:
            file_content = gz_ref.read()
        # strip only the trailing '.gz'; str.replace would also eat a '.gz'
        # occurring in the middle of the name
        inner_name = os.path.basename(file_path)[:-len('.gz')]
        output_file_path = os.path.join(extract_to, inner_name)
        with open(output_file_path, 'wb') as out_file:
            out_file.write(file_content)
        os.chmod(output_file_path, 0o755)
        return [output_file_path]
    elif file_path.endswith('.tar.Z'):
        # third-party LZW decompressor, imported lazily because it is only
        # needed for this format
        import unlzw3
        from pathlib import Path
        uncompressed_data = unlzw3.unlzw(Path(file_path).read_bytes())
        inner_file = file_path[:-2]       # drop the trailing '.Z'
        with open(inner_file, 'wb') as file:
            file.write(uncompressed_data)

        if is_archive(inner_file):
            return extract_archive(inner_file, extract_to)
        else:
            return [inner_file]
    else:
        raise ValueError("Unsupported file format:", file_path)

430 

def is_archive(file_path):
    """
    Tell whether a path looks like an archive, judged by its suffix only.

    The comparison is case-insensitive. '.tgz' is included because
    extract_archive can unpack it. Some listed suffixes ('.bz2', '.7z',
    '.rar') are recognised here although extract_archive rejects them.

    Parameters:
        file_path (str): path or file name to inspect

    Returns:
        bool: True when the name ends with a known archive suffix.
    """
    archive_extensions = ('.zip', '.tar', '.tgz', '.gz', '.bz2', '.7z', '.rar', '.z', '.tar.z')
    return file_path.lower().endswith(archive_extensions)

435 

436def read_csv(file, header=None, delimiter=None): 

437 largest_col_cnt = 0 

438 

439 with open(file, 'r') as temp_f: 

440 lines = temp_f.readlines() 

441 

442 for l in lines: 

443 col_cnt = len(l.split(delimiter)) + 1 

444 

445 largest_col_cnt = max(largest_col_cnt, col_cnt) 

446 

447 col_names = [i for i in range(0, largest_col_cnt)] 

448 

449 return pd.read_csv(file, header=None, delimiter=delimiter, names=col_names) 

450 

def try_json_extraction(archive, name_list=None):
    """
    Try to import a dataset as raw file contents grouped by directory.

    Parameters:
        archive: archive to unpack into ``<cwd>/extracted_files``; only used
            when name_list is None
        name_list: member names relative to the extract folder; when given,
            the files are assumed to be extracted already

    Returns:
        * pandas.DataFrame when the members are at most two levels deep below
          their common prefix: one column per parent directory, one cell per
          file holding its text ("" for anything that could not be read), or
        * a nested dict mirroring the directory tree (directory -> {file name:
          text, ...}) for deeper layouts.
    """

    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    # Extract the archive
    if name_list is None:
        name_list = extract_archive(archive, extract_path)

    extracted_files = [os.path.join(extract_path, file) for file in name_list]

    print('Attempting CSV import...')

    from pathlib import Path

    # NOTE(review): commonprefix compares character by character, so the
    # prefix is not guaranteed to end on a path-component boundary. The
    # slicing below assumes it does - confirm against real archives.
    start_folder = os.path.commonprefix(extracted_files)
    extracted_files = [s[len(start_folder):] for s in extracted_files]

    # deepest nesting of any member below the common prefix
    max_depth = 0

    for e in extracted_files:
        depth = len(Path(e).parents)

        max_depth = max(max_depth, depth)

    def read(file):
        # Best effort: directories, binary files and missing paths yield "".
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        try:
            with open(file, 'r') as f:
                # Read the entire content of the file
                content = f.read()

                return content
        except Exception:
            return ""

    if max_depth <= 2:
        data = dict()

        for f in extracted_files:
            if f == '':
                continue
            # drop the first character, presumably the leading path separator
            f = f[1:]

            parts = os.path.split(f)

            if parts[0] not in data:
                data[parts[0]] = []

            data[parts[0]].append(read(os.path.join(start_folder, f)))

        # rows are directories here; transpose so each directory is a column
        df = pd.DataFrame.from_dict(data, orient='index')
        df = df.transpose()

        return df

    ret = {}
    # traverse root directory, and list directories as dirs and files as files
    print('Fallback: returning JSON dir structure')
    start_idx = len(splitpath(start_folder))
    for root, dirs, files in os.walk(start_folder):
        # path components from the last component of start_folder downwards
        path = splitpath(root)[start_idx-1:]

        curr = {}

        for file in files:
            curr[file] = read(os.path.join(root, file))

        # os.walk is top-down, so every parent node already exists in ret
        currnode = ret
        for part in path[:-1]:
            currnode = currnode[part]
        currnode[path[-1]] = curr

    return ret

527 

def splitpath(path):
    """
    Break a path into its components, accepting both '/' and '\\' separators.

    Runs of separators count as one, and empty components (for example from a
    leading or trailing separator) are dropped.

    Parameters:
        path (str): path to split

    Returns:
        list of str: the non-empty components, in order.
    """
    return list(filter(None, re.split(r'[/\\]+', path)))

532 

def read_gz_as_csv(gz_path):
    """
    Parse a gzip-compressed, space-separated text file into a DataFrame.

    Text from '#' to the end of a line is treated as a comment.

    Parameters:
        gz_path: path of the .gz file

    Returns:
        pandas.DataFrame with the parsed content.
    """
    # decompress in text mode and let pandas read straight from the stream
    with gzip.open(gz_path, mode='rt') as stream:
        return pd.read_csv(stream, comment='#', sep=' ')

539 

def _load_eeg_dataset():
    """Download (if needed), unpack and parse the EEG database archive.

    Returns a dict mapping each entry of the 'eeg_full' folder to a dict of
    {member basename: DataFrame} for the members whose path contains '.gz'.
    """
    clear_extract_folder()

    file_path = os.path.join(os.getcwd(), 'eeg+database.zip')
    if not os.path.exists(file_path):
        download_file('https://archive.ics.uci.edu/static/public/121/eeg+database.zip', file_path)

    extract_path = os.path.join(os.getcwd(), 'extracted_files')

    print('extracting...')
    extract_archive(file_path, extract_path)

    eeg_full_files = os.path.join(extract_path, 'eeg_full')

    ret = dict()

    # os.listdir returns every entry of the folder, not only regular files
    for file in os.listdir(eeg_full_files):
        print('processing:', file)
        name_list = extract_archive(os.path.join(eeg_full_files, file), extract_path)

        frames = ret[os.path.basename(file)] = dict()
        for member in name_list:
            member_path = os.path.join(extract_path, member)
            if '.gz' not in member_path:
                continue
            try:
                frames[os.path.basename(member_path)] = read_gz_as_csv(member_path)
            except Exception:
                # best effort: skip members that cannot be parsed, but let
                # KeyboardInterrupt / SystemExit through
                pass

    return ret


def special_case_download(name):
    """
    Load one of the datasets that need hand-written import logic.

    Parameters:
        name (str): one of 'movie', 'diabetes', 'reuters' or 'eeg'

    Returns:
        dict: for 'movie', 'diabetes' and 'reuters', file name -> DataFrame.
        A file of that name in the current working directory is used if
        present, otherwise it is read from the dataset's base URL. For 'eeg',
        a nested dict of DataFrames (see _load_eeg_dataset).

    Raises:
        ValueError: if name is not one of the supported keys (this used to
            surface as an unbound-local error).
    """
    if name == 'eeg':
        # For this dataset we use fully custom code
        return _load_eeg_dataset()

    # key -> (base URL, file names, delimiter passed to pandas)
    sources = {
        'movie': (
            'https://raw.githubusercontent.com/cernoch/movies/master/data/',
            [
                'actors.csv',
                'casts.csv',
                'remakes.csv',
                'studios.csv',
                'synonyms.csv',
                'main.csv',
                'people.csv',
            ],
            None,
        ),
        'diabetes': (
            'https://raw.githubusercontent.com/kenneth-ge/UCI-Import/main/manually_cleaned/diabetes',
            [
                'diabetes_complete.tsv',
            ],
            '\t',
        ),
        'reuters': (
            'https://raw.githubusercontent.com/kenneth-ge/UCI-Import/main/manually_cleaned/reuters',
            [
                'reuters_hayes_test.csv',
                'reuters_hayes_train.csv',
                'reuters_lewis_test.csv',
                'reuters_lewis_train.csv',
                'reuters_apte_test.csv',
                'reuters_apte_train.csv',
            ],
            None,
        ),
    }

    source = sources.get(name) if isinstance(name, str) else None
    if source is None:
        raise ValueError('Unknown special-case dataset: {!r}'.format(name))
    base_url, files, delim = source

    ret = {}

    for f in files:
        if os.path.exists(f):
            # prefer a local copy in the working directory
            ret[f] = pd.read_csv(f, on_bad_lines='skip', delimiter=delim)
        else:
            # URLs always use '/', so join by hand rather than via os.path
            path = base_url.rstrip('/') + '/' + f
            print('path:', path)
            ret[f] = pd.read_csv(path, on_bad_lines='skip', delimiter=delim)

    return ret

623 

624import requests 

625from requests.exceptions import Timeout 

626import time 

627 

def download_file(url, file_path, connect_timeout=10, read_timeout=99999):
    """
    Stream a URL to a local file.

    Parameters:
        url: address to download
        file_path: destination path; overwritten if it exists
        connect_timeout: seconds allowed for establishing the connection
        read_timeout: seconds allowed between received bytes, and also the
            budget for the whole download

    Raises:
        Exception: on any failure (HTTP error, timeout, I/O error). The
            partially written file is removed and the original error is
            attached as the cause.
    """
    try:
        # monotonic clock: the elapsed-time check must not be affected by
        # wall-clock adjustments
        start_time = time.monotonic()
        # Use a session to handle the request
        with requests.Session() as session:
            with session.get(url, stream=True, timeout=(connect_timeout, read_timeout)) as response:
                response.raise_for_status()   # Check for HTTP errors

                # Write the content to a file
                with open(file_path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        # the per-read timeout alone never fires on a slow but
                        # steady stream, so enforce a total budget as well
                        if time.monotonic() - start_time > read_timeout:
                            raise Timeout()
                        if chunk:   # filter out keep-alive new chunks
                            file.write(chunk)

    except Exception as err:
        print(err)
        # Drop the partial download. The file may not exist at all when the
        # failure happened before it was opened, and a cleanup problem must
        # not mask the real error.
        try:
            os.remove(file_path)
        except OSError:
            pass
        raise Exception(f"Download timed out or another error occurred: {err}") from err