Coverage for mall/polars.py: 94%

47 statements  

« prev     ^ index     » next       coverage.py v7.6.3, created at 2024-10-15 15:57 -0500

1import polars as pl 

2 

3from mall.prompt import ( 

4 sentiment, 

5 summarize, 

6 translate, 

7 classify, 

8 extract, 

9 custom, 

10 verify, 

11) 

12from mall.llm import map_call 

13 

14 

15@pl.api.register_dataframe_namespace("llm") 

16class MallFrame: 

17 """Extension to Polars that add ability to use 

18 an LLM to run batch predictions over a data frame 

19 

20 We will start by loading the needed libraries, and 

21 set up the data frame that will be used in the 

22 examples: 

23 

24 ```{python} 

25 #| output: false 

26 import mall 

27 import polars as pl 

28 pl.Config(fmt_str_lengths=100) 

29 pl.Config.set_tbl_hide_dataframe_shape(True) 

30 pl.Config.set_tbl_hide_column_data_types(True) 

31 data = mall.MallData 

32 reviews = data.reviews 

33 reviews.llm.use(options = dict(seed = 100)) 

34 ``` 

35 """ 

36 

37 def __init__(self, df: pl.DataFrame) -> None: 

38 self._df = df 

39 self._use = dict(backend="ollama", model="llama3.2", _cache="_mall_cache") 

40 

41 def use(self, backend="", model="", _cache="_mall_cache", **kwargs): 

42 """Define the model, backend, and other options to use to 

43 interact with the LLM. 

44 

45 Parameters 

46 ------ 

47 backend : str 

48 The name of the backend to use. At the beginning of the session 

49 it defaults to "ollama". If passing `""`, it will remain unchanged 

50 model : str 

51 The name of the model tha the backend should use. At the beginning 

52 of the session it defaults to "llama3.2". If passing `""`, it will 

53 remain unchanged 

54 _cache : str 

55 The path of where to save the cached results. Passing `""` disables 

56 the cache 

57 **kwargs 

58 Arguments to pass to the downstream Python call. In this case, the 

59 `chat` function in `ollama` 

60 

61 Examples 

62 ------ 

63 

64 ```{python} 

65 # Additional arguments will be passed 'as-is' to the 

66 # downstream R function in this example, to ollama::chat() 

67 reviews.llm.use("ollama", "llama3.2", seed = 100, temp = 0.1) 

68 ``` 

69 

70 ```{python} 

71 # During the Python session, you can change any argument 

72 # individually and it will retain all of previous 

73 # arguments used 

74 reviews.llm.use(temp = 0.3) 

75 ``` 

76 

77 ```{python} 

78 # Use _cache to modify the target folder for caching 

79 reviews.llm.use(_cache = "_my_cache") 

80 ``` 

81 

82 ```{python} 

83 # Leave _cache empty to turn off this functionality 

84 reviews.llm.use(_cache = "") 

85 ``` 

86 """ 

87 if backend != "": 

88 self._use.update(dict(backend=backend)) 

89 if model != "": 

90 self._use.update(dict(model=model)) 

91 self._use.update(dict(_cache=_cache)) 

92 self._use.update(dict(kwargs)) 

93 return self._use 

94 

95 def sentiment( 

96 self, 

97 col, 

98 options=["positive", "negative", "neutral"], 

99 additional="", 

100 pred_name="sentiment", 

101 ) -> list[pl.DataFrame]: 

102 """Use an LLM to run a sentiment analysis 

103 

104 Parameters 

105 ------ 

106 col : str 

107 The name of the text field to process 

108 

109 options : list or dict 

110 A list of the sentiment options to use, or a named DICT 

111 object 

112 

113 pred_name : str 

114 A character vector with the name of the new column where the 

115 prediction will be placed 

116 

117 additional : str 

118 Inserts this text into the prompt sent to the LLM 

119 

120 

121 Examples 

122 ------ 

123 

124 ```{python} 

125 reviews.llm.sentiment("review") 

126 ``` 

127 

128 ```{python} 

129 # Use 'pred_name' to customize the new column's name 

130 reviews.llm.sentiment("review", pred_name="review_sentiment") 

131 ``` 

132 

133 ```{python} 

134 # Pass custom sentiment options 

135 reviews.llm.sentiment("review", ["positive", "negative"]) 

136 ``` 

137 

138 ```{python} 

139 # Use a DICT object to specify values to return per sentiment 

140 reviews.llm.sentiment("review", {"positive" : 1, "negative" : 0}) 

141 ``` 

142 

143 """ 

144 df = map_call( 

145 df=self._df, 

146 col=col, 

147 msg=sentiment(options, additional=additional), 

148 pred_name=pred_name, 

149 use=self._use, 

150 valid_resps=options, 

151 ) 

152 return df 

153 

154 def summarize( 

155 self, 

156 col, 

157 max_words=10, 

158 additional="", 

159 pred_name="summary", 

160 ) -> list[pl.DataFrame]: 

161 """Summarize the text down to a specific number of words. 

162 

163 Parameters 

164 ------ 

165 col : str 

166 The name of the text field to process 

167 

168 max_words : int 

169 Maximum number of words to use for the summary 

170 

171 pred_name : str 

172 A character vector with the name of the new column where the 

173 prediction will be placed 

174 

175 additional : str 

176 Inserts this text into the prompt sent to the LLM 

177 

178 Examples 

179 ------ 

180 

181 ```{python} 

182 # Use max_words to set the maximum number of words to use for the summary 

183 reviews.llm.summarize("review", max_words = 5) 

184 ``` 

185 

186 ```{python} 

187 # Use 'pred_name' to customize the new column's name 

188 reviews.llm.summarize("review", 5, pred_name = "review_summary") 

189 ``` 

190 """ 

191 df = map_call( 

192 df=self._df, 

193 col=col, 

194 msg=summarize(max_words, additional=additional), 

195 pred_name=pred_name, 

196 use=self._use, 

197 ) 

198 return df 

199 

200 def translate( 

201 self, 

202 col, 

203 language="", 

204 additional="", 

205 pred_name="translation", 

206 ) -> list[pl.DataFrame]: 

207 """Translate text into another language. 

208 

209 Parameters 

210 ------ 

211 col : str 

212 The name of the text field to process 

213 

214 language : str 

215 The target language to translate to. For example 'French'. 

216 

217 pred_name : str 

218 A character vector with the name of the new column where the 

219 prediction will be placed 

220 

221 additional : str 

222 Inserts this text into the prompt sent to the LLM 

223 

224 

225 Examples 

226 ------ 

227 

228 ```{python} 

229 reviews.llm.translate("review", "spanish") 

230 ``` 

231 

232 ```{python} 

233 reviews.llm.translate("review", "french") 

234 ``` 

235 

236 """ 

237 df = map_call( 

238 df=self._df, 

239 col=col, 

240 msg=translate(language, additional=additional), 

241 pred_name=pred_name, 

242 use=self._use, 

243 ) 

244 return df 

245 

246 def classify( 

247 self, 

248 col, 

249 labels="", 

250 additional="", 

251 pred_name="classify", 

252 ) -> list[pl.DataFrame]: 

253 """Classify text into specific categories. 

254 

255 Parameters 

256 ------ 

257 col : str 

258 The name of the text field to process 

259 

260 labels : list 

261 A list or a DICT object that defines the categories to 

262 classify the text as. It will return one of the provided 

263 labels. 

264 

265 pred_name : str 

266 A character vector with the name of the new column where the 

267 prediction will be placed 

268 

269 additional : str 

270 Inserts this text into the prompt sent to the LLM 

271 

272 Examples 

273 ------ 

274 

275 ```{python} 

276 reviews.llm.classify("review", ["appliance", "computer"]) 

277 ``` 

278 

279 ```{python} 

280 # Use 'pred_name' to customize the new column's name 

281 reviews.llm.classify("review", ["appliance", "computer"], pred_name="prod_type") 

282 ``` 

283 

284 ```{python} 

285 #Pass a DICT to set custom values for each classification 

286 reviews.llm.classify("review", {"appliance" : "1", "computer" : "2"}) 

287 ``` 

288 """ 

289 df = map_call( 

290 df=self._df, 

291 col=col, 

292 msg=classify(labels, additional=additional), 

293 pred_name=pred_name, 

294 use=self._use, 

295 valid_resps=labels, 

296 ) 

297 return df 

298 

299 def extract( 

300 self, 

301 col, 

302 labels="", 

303 expand_cols=False, 

304 additional="", 

305 pred_name="extract", 

306 ) -> list[pl.DataFrame]: 

307 """Pull a specific label from the text. 

308 

309 Parameters 

310 ------ 

311 col : str 

312 The name of the text field to process 

313 

314 labels : list 

315 A list or a DICT object that defines tells the LLM what 

316 to look for and return 

317 

318 pred_name : str 

319 A character vector with the name of the new column where the 

320 prediction will be placed 

321 

322 additional : str 

323 Inserts this text into the prompt sent to the LLM 

324 

325 Examples 

326 ------ 

327 

328 ```{python} 

329 # Use 'labels' to let the function know what to extract 

330 reviews.llm.extract("review", labels = "product") 

331 ``` 

332 

333 ```{python} 

334 # Use 'pred_name' to customize the new column's name 

335 reviews.llm.extract("review", "product", pred_name = "prod") 

336 ``` 

337 

338 ```{python} 

339 # Pass a vector to request multiple things, the results will be pipe delimeted 

340 # in a single column 

341 reviews.llm.extract("review", ["product", "feelings"]) 

342 ``` 

343 

344 ```{python} 

345 # Set 'expand_cols' to True to split multiple lables 

346 # into individual columns 

347 reviews.llm.extract( 

348 col="review", 

349 labels=["product", "feelings"], 

350 expand_cols=True 

351 ) 

352 ``` 

353 

354 ```{python} 

355 # Set custom names to the resulting columns 

356 reviews.llm.extract( 

357 col="review", 

358 labels={"prod": "product", "feels": "feelings"}, 

359 expand_cols=True 

360 ) 

361 ``` 

362 

363 """ 

364 

365 lab_names = labels 

366 lab_vals = labels 

367 if isinstance(labels, dict): 

368 lab_names = [] 

369 lab_vals = [] 

370 for label in labels: 

371 lab_names.append(label) 

372 lab_vals.append(labels[label]) 

373 df = map_call( 

374 df=self._df, 

375 col=col, 

376 msg=extract(lab_vals, additional=additional), 

377 pred_name=pred_name, 

378 use=self._use, 

379 ) 

380 if expand_cols: 

381 df = df.with_columns( 

382 pl.col("extract") 

383 .str.split_exact(n=len(labels) - 1, by="|") 

384 .struct.rename_fields(lab_names) 

385 ).unnest("extract") 

386 

387 return df 

388 

389 def custom( 

390 self, 

391 col, 

392 prompt="", 

393 valid_resps="", 

394 pred_name="custom", 

395 ) -> list[pl.DataFrame]: 

396 """Provide the full prompt that the LLM will process. 

397 

398 Parameters 

399 ------ 

400 col : str 

401 The name of the text field to process 

402 

403 prompt : str 

404 The prompt to send to the LLM along with the `col` 

405 

406 pred_name : str 

407 A character vector with the name of the new column where the 

408 prediction will be placed 

409 

410 

411 Examples 

412 ------ 

413 

414 ```{python} 

415 my_prompt = ( 

416 "Answer a question." 

417 "Return only the answer, no explanation" 

418 "Acceptable answers are 'yes', 'no'" 

419 "Answer this about the following text, is this a happy customer?:" 

420 ) 

421 

422 reviews.llm.custom("review", prompt = my_prompt) 

423 ``` 

424 """ 

425 df = map_call( 

426 df=self._df, 

427 col=col, 

428 msg=custom(prompt), 

429 pred_name=pred_name, 

430 use=self._use, 

431 valid_resps=valid_resps, 

432 ) 

433 return df 

434 

435 def verify( 

436 self, 

437 col, 

438 what="", 

439 yes_no=[1, 0], 

440 additional="", 

441 pred_name="verify", 

442 ) -> list[pl.DataFrame]: 

443 """Check to see if something is true about the text. 

444 

445 Parameters 

446 ------ 

447 col : str 

448 The name of the text field to process 

449 

450 what : str 

451 The statement or question that needs to be verified against the 

452 provided text 

453 

454 yes_no : list 

455 A positional list of size 2, which contains the values to return 

456 if true and false. The first position will be used as the 'true' 

457 value, and the second as the 'false' value 

458 

459 pred_name : str 

460 A character vector with the name of the new column where the 

461 prediction will be placed 

462 

463 additional : str 

464 Inserts this text into the prompt sent to the LLM 

465 

466 Examples 

467 ------ 

468 

469 ```{python} 

470 reviews.llm.verify("review", "is the customer happy") 

471 ``` 

472 

473 ```{python} 

474 # Use 'yes_no' to modify the 'true' and 'false' values to return 

475 reviews.llm.verify("review", "is the customer happy", ["y", "n"]) 

476 ``` 

477 """ 

478 df = map_call( 

479 df=self._df, 

480 col=col, 

481 msg=verify(what, additional=additional), 

482 pred_name=pred_name, 

483 use=self._use, 

484 valid_resps=yes_no, 

485 convert=dict(yes=yes_no[0], no=yes_no[1]), 

486 ) 

487 return df