Coverage for mall/polars.py: 94%
47 statements
« prev ^ index » next coverage.py v7.6.3, created at 2024-10-15 15:57 -0500
« prev ^ index » next coverage.py v7.6.3, created at 2024-10-15 15:57 -0500
1import polars as pl
3from mall.prompt import (
4 sentiment,
5 summarize,
6 translate,
7 classify,
8 extract,
9 custom,
10 verify,
11)
12from mall.llm import map_call
@pl.api.register_dataframe_namespace("llm")
class MallFrame:
    """Extension to Polars that adds the ability to use
    an LLM to run batch predictions over a data frame

    We will start by loading the needed libraries, and
    set up the data frame that will be used in the
    examples:

    ```{python}
    #| output: false
    import mall
    import polars as pl
    pl.Config(fmt_str_lengths=100)
    pl.Config.set_tbl_hide_dataframe_shape(True)
    pl.Config.set_tbl_hide_column_data_types(True)
    data = mall.MallData
    reviews = data.reviews
    reviews.llm.use(options = dict(seed = 100))
    ```
    """

    def __init__(self, df: pl.DataFrame) -> None:
        # The data frame this namespace instance is attached to
        self._df = df
        # Session settings forwarded to every `map_call`; modified via `use()`
        self._use = dict(backend="ollama", model="llama3.2", _cache="_mall_cache")

    def use(self, backend="", model="", _cache="_mall_cache", **kwargs):
        """Define the model, backend, and other options to use to
        interact with the LLM.

        Parameters
        ------
        backend : str
            The name of the backend to use. At the beginning of the session
            it defaults to "ollama". If passing `""`, it will remain unchanged
        model : str
            The name of the model that the backend should use. At the beginning
            of the session it defaults to "llama3.2". If passing `""`, it will
            remain unchanged
        _cache : str
            The path of where to save the cached results. Passing `""` disables
            the cache. Unlike `backend` and `model`, this value is applied on
            every call, so omitting it resets the folder to "_mall_cache"
        **kwargs
            Arguments to pass to the downstream Python call. In this case, the
            `chat` function in `ollama`

        Returns
        ------
        dict
            The settings dictionary held by this object after the update
            (the same object, not a copy)

        Examples
        ------

        ```{python}
        # Additional arguments will be passed 'as-is' to the
        # downstream Python function, in this example to ollama.chat()
        reviews.llm.use("ollama", "llama3.2", seed = 100, temp = 0.1)
        ```

        ```{python}
        # During the Python session, you can change any argument
        # individually and it will retain the backend, model and
        # additional arguments previously used
        reviews.llm.use(temp = 0.3)
        ```

        ```{python}
        # Use _cache to modify the target folder for caching
        reviews.llm.use(_cache = "_my_cache")
        ```

        ```{python}
        # Leave _cache empty to turn off this functionality
        reviews.llm.use(_cache = "")
        ```
        """
        # An empty string means "keep the current value" for backend and model
        if backend != "":
            self._use.update(dict(backend=backend))
        if model != "":
            self._use.update(dict(model=model))
        # _cache is always written because "" is a meaningful value (cache off)
        self._use.update(dict(_cache=_cache))
        self._use.update(dict(kwargs))
        return self._use

    def sentiment(
        self,
        col: str,
        options=["positive", "negative", "neutral"],
        additional: str = "",
        pred_name: str = "sentiment",
    ) -> pl.DataFrame:
        """Use an LLM to run a sentiment analysis

        Parameters
        ------
        col : str
            The name of the text field to process

        options : list or dict
            A list of the sentiment options to use, or a named DICT
            object

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM


        Examples
        ------

        ```{python}
        reviews.llm.sentiment("review")
        ```

        ```{python}
        # Use 'pred_name' to customize the new column's name
        reviews.llm.sentiment("review", pred_name="review_sentiment")
        ```

        ```{python}
        # Pass custom sentiment options
        reviews.llm.sentiment("review", ["positive", "negative"])
        ```

        ```{python}
        # Use a DICT object to specify values to return per sentiment
        reviews.llm.sentiment("review", {"positive" : 1, "negative" : 0})
        ```

        """
        df = map_call(
            df=self._df,
            col=col,
            msg=sentiment(options, additional=additional),
            pred_name=pred_name,
            use=self._use,
            valid_resps=options,
        )
        return df

    def summarize(
        self,
        col: str,
        max_words: int = 10,
        additional: str = "",
        pred_name: str = "summary",
    ) -> pl.DataFrame:
        """Summarize the text down to a specific number of words.

        Parameters
        ------
        col : str
            The name of the text field to process

        max_words : int
            Maximum number of words to use for the summary

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM

        Examples
        ------

        ```{python}
        # Use max_words to set the maximum number of words to use for the summary
        reviews.llm.summarize("review", max_words = 5)
        ```

        ```{python}
        # Use 'pred_name' to customize the new column's name
        reviews.llm.summarize("review", 5, pred_name = "review_summary")
        ```
        """
        df = map_call(
            df=self._df,
            col=col,
            msg=summarize(max_words, additional=additional),
            pred_name=pred_name,
            use=self._use,
        )
        return df

    def translate(
        self,
        col: str,
        language: str = "",
        additional: str = "",
        pred_name: str = "translation",
    ) -> pl.DataFrame:
        """Translate text into another language.

        Parameters
        ------
        col : str
            The name of the text field to process

        language : str
            The target language to translate to. For example 'French'.

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM


        Examples
        ------

        ```{python}
        reviews.llm.translate("review", "spanish")
        ```

        ```{python}
        reviews.llm.translate("review", "french")
        ```

        """
        df = map_call(
            df=self._df,
            col=col,
            msg=translate(language, additional=additional),
            pred_name=pred_name,
            use=self._use,
        )
        return df

    def classify(
        self,
        col: str,
        labels="",
        additional: str = "",
        pred_name: str = "classify",
    ) -> pl.DataFrame:
        """Classify text into specific categories.

        Parameters
        ------
        col : str
            The name of the text field to process

        labels : list
            A list or a DICT object that defines the categories to
            classify the text as. It will return one of the provided
            labels.

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM

        Examples
        ------

        ```{python}
        reviews.llm.classify("review", ["appliance", "computer"])
        ```

        ```{python}
        # Use 'pred_name' to customize the new column's name
        reviews.llm.classify("review", ["appliance", "computer"], pred_name="prod_type")
        ```

        ```{python}
        # Pass a DICT to set custom values for each classification
        reviews.llm.classify("review", {"appliance" : "1", "computer" : "2"})
        ```
        """
        df = map_call(
            df=self._df,
            col=col,
            msg=classify(labels, additional=additional),
            pred_name=pred_name,
            use=self._use,
            valid_resps=labels,
        )
        return df

    def extract(
        self,
        col: str,
        labels="",
        expand_cols: bool = False,
        additional: str = "",
        pred_name: str = "extract",
    ) -> pl.DataFrame:
        """Pull a specific label from the text.

        Parameters
        ------
        col : str
            The name of the text field to process

        labels : list
            A string, a list or a DICT object that tells the LLM what
            to look for and return. When a DICT is passed, its values are
            sent to the LLM and its keys are used as the names of the
            expanded columns

        expand_cols : bool
            If `True`, the pipe delimited prediction is split into one
            column per label, replacing the `pred_name` column

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM

        Examples
        ------

        ```{python}
        # Use 'labels' to let the function know what to extract
        reviews.llm.extract("review", labels = "product")
        ```

        ```{python}
        # Use 'pred_name' to customize the new column's name
        reviews.llm.extract("review", "product", pred_name = "prod")
        ```

        ```{python}
        # Pass a list to request multiple things, the results will be pipe delimited
        # in a single column
        reviews.llm.extract("review", ["product", "feelings"])
        ```

        ```{python}
        # Set 'expand_cols' to True to split multiple labels
        # into individual columns
        reviews.llm.extract(
            col="review",
            labels=["product", "feelings"],
            expand_cols=True
        )
        ```

        ```{python}
        # Set custom names to the resulting columns
        reviews.llm.extract(
            col="review",
            labels={"prod": "product", "feels": "feelings"},
            expand_cols=True
        )
        ```

        """
        # For a dict, keys name the output columns and values go to the prompt;
        # for anything else the same object serves both purposes.
        lab_names = labels
        lab_vals = labels
        if isinstance(labels, dict):
            lab_names = list(labels)
            lab_vals = list(labels.values())
        df = map_call(
            df=self._df,
            col=col,
            msg=extract(lab_vals, additional=additional),
            pred_name=pred_name,
            use=self._use,
        )
        if expand_cols:
            # A bare string is ONE label; without wrapping it, it would be
            # treated as a sequence of characters (one field per character).
            if isinstance(lab_names, str):
                field_names = [lab_names]
            else:
                field_names = list(lab_names)
            # Operate on `pred_name` rather than a hard-coded "extract" so a
            # custom prediction column name also works with `expand_cols`.
            df = df.with_columns(
                pl.col(pred_name)
                .str.split_exact(n=len(field_names) - 1, by="|")
                .struct.rename_fields(field_names)
            ).unnest(pred_name)

        return df

    def custom(
        self,
        col: str,
        prompt: str = "",
        valid_resps="",
        pred_name: str = "custom",
    ) -> pl.DataFrame:
        """Provide the full prompt that the LLM will process.

        Parameters
        ------
        col : str
            The name of the text field to process

        prompt : str
            The prompt to send to the LLM along with the `col`

        valid_resps : list
            Forwarded as-is to the prediction call as `valid_resps`;
            presumably the responses to accept from the LLM, as in the
            other methods (TODO confirm against `map_call`)

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed


        Examples
        ------

        ```{python}
        my_prompt = (
            "Answer a question."
            "Return only the answer, no explanation"
            "Acceptable answers are 'yes', 'no'"
            "Answer this about the following text, is this a happy customer?:"
        )

        reviews.llm.custom("review", prompt = my_prompt)
        ```
        """
        df = map_call(
            df=self._df,
            col=col,
            msg=custom(prompt),
            pred_name=pred_name,
            use=self._use,
            valid_resps=valid_resps,
        )
        return df

    def verify(
        self,
        col: str,
        what: str = "",
        yes_no=[1, 0],
        additional: str = "",
        pred_name: str = "verify",
    ) -> pl.DataFrame:
        """Check to see if something is true about the text.

        Parameters
        ------
        col : str
            The name of the text field to process

        what : str
            The statement or question that needs to be verified against the
            provided text

        yes_no : list
            A positional list of size 2, which contains the values to return
            if true and false. The first position will be used as the 'true'
            value, and the second as the 'false' value

        pred_name : str
            A character vector with the name of the new column where the
            prediction will be placed

        additional : str
            Inserts this text into the prompt sent to the LLM

        Examples
        ------

        ```{python}
        reviews.llm.verify("review", "is the customer happy")
        ```

        ```{python}
        # Use 'yes_no' to modify the 'true' and 'false' values to return
        reviews.llm.verify("review", "is the customer happy", ["y", "n"])
        ```
        """
        df = map_call(
            df=self._df,
            col=col,
            msg=verify(what, additional=additional),
            pred_name=pred_name,
            use=self._use,
            valid_resps=yes_no,
            # Pair "yes"/"no" with the caller's true/false values
            convert=dict(yes=yes_no[0], no=yes_no[1]),
        )
        return df