php-llm-agent/lib/actions/wikipedia.action.php

<?php
/**
 * ACTIONS: Wikipedia Integration
 * ------------------------------
 * @wikiSummary <Title>      → returns the lead/summary paragraph
 * @wikiFullArticle <Title>  → returns the full article as plain text
 *
 * Examples:
 *   @wikiSummary Finland
 *   @wikiFullArticle Bitcoin
 */
// --- Register actions --------------------------------------------------------
registerAction(
    (new Action())
        ->setFunctionName("wikiSummary")
        ->setArgumentsTypes(["string"]) // title
);
registerAction(
    (new Action())
        ->setFunctionName("wikiFullArticle")
        ->setArgumentsTypes(["string"]) // title
);
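// How a registered action is exercised (a sketch; assumes the framework passes
// the text after the action name as the single string argument):
//   user message: "@wikiSummary Finland"       →  wikiSummary("Finland")
//   user message: "@wikiFullArticle New-York"  →  wikiFullArticle("New-York")
// Hyphens in multi-word titles are mapped back to spaces inside each action.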
// --- Helpers ---------------------------------------------------------------
/**
 * Basic HTTP GET via cURL.
 *
 * @return array [int $httpCode, ?string $body, ?string $err]
 */
function http_get(string $url, int $timeout = 6): array {
    $ch = curl_init($url);
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT        => $timeout,
        CURLOPT_CONNECTTIMEOUT => 4,
        CURLOPT_USERAGENT      => "LLM-Action-Demo/1.0 (+https://example.com)",
    ]);
    $body = curl_exec($ch);
    $err  = curl_error($ch) ?: null;
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);
    // curl_exec() returns false on failure; normalize to null to match the docblock.
    return [$code, $body === false ? null : $body, $err];
}
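/*
 * Quick smoke test for http_get (a sketch; run manually from the CLI, not part
 * of the action flow):
 *
 *   [$code, $body, $err] = http_get("https://en.wikipedia.org/api/rest_v1/page/summary/Finland");
 *   // expected on success: $code → 200, $err → null, $body → JSON with an "extract" field
 */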
/** Safely pick the first (and only) page object from a MediaWiki Action API response. */
function mw_first_page(array $json): ?array {
    if (!isset($json['query']['pages']) || !is_array($json['query']['pages'])) return null;
    foreach ($json['query']['pages'] as $page) {
        return $page; // pages are keyed by pageid; we only requested one title
    }
    return null;
}
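/*
 * Shape this helper expects (with format=json the Action API keys pages by
 * pageid, so the single requested page sits under an unknown numeric key;
 * pageids below are illustrative):
 *
 *   {"query":{"pages":{"12345":{"pageid":12345,"title":"Finland","extract":"..."}}}}
 *
 * Missing titles come back under the key "-1" with a "missing" marker:
 *
 *   {"query":{"pages":{"-1":{"title":"No such page","missing":""}}}}
 */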
// --- Actions ---------------------------------------------------------------
/**
 * @param string $title Wikipedia article title
 * @return string Lead/summary paragraph, or a human-readable error message
 */
function wikiSummary(string $title): string {
    $title   = str_replace("-", " ", $title); // action arguments arrive hyphenated; restore spaces
    $encoded = rawurlencode($title);
    $url     = "https://en.wikipedia.org/api/rest_v1/page/summary/{$encoded}";
    [$code, $body, $err] = http_get($url);
    if ($err) return "Error fetching summary for '{$title}': {$err}";
    if ($code < 200 || $code >= 300 || !$body) return "HTTP {$code}: Failed to fetch summary for '{$title}'.";
    $data = json_decode($body, true);
    if (isset($data['extract']) && is_string($data['extract']) && $data['extract'] !== '') {
        return $data['extract'];
    }
    // Common "not found" / disambiguation handling
    if (!empty($data['type']) && $data['type'] === 'disambiguation') {
        return "'{$title}' is a disambiguation page. Try a more specific title.";
    }
    return "No summary found for '{$title}'.";
}
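/*
 * Example REST summary response handled above (fields abridged; illustrative):
 *
 *   {"type":"standard","title":"Finland","extract":"Finland, officially the
 *    Republic of Finland, is a Nordic country ..."}
 *
 * Disambiguation pages return "type":"disambiguation", which the branch above
 * turns into a "try a more specific title" hint.
 */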
/**
 * Returns the full article as plain text (sections + paragraphs).
 * Uses the MediaWiki Action API with plaintext extracts.
 *
 * @param string $title
 * @return string
 */
function wikiFullArticle(string $title): string {
    $title   = str_replace("-", " ", $title); // action arguments arrive hyphenated; restore spaces
    $encoded = rawurlencode($title);
    $url = "https://en.wikipedia.org/w/api.php"
         . "?action=query"
         . "&prop=extracts"
         . "&explaintext=1"
         . "&exsectionformat=plain"
         . "&format=json"
         . "&redirects=1"
         . "&titles={$encoded}";
    [$code, $body, $err] = http_get($url, 12);
    if ($err) return "Error fetching article for '{$title}': {$err}";
    if ($code < 200 || $code >= 300 || !$body) return "HTTP {$code}: Failed to fetch article for '{$title}'.";
    // json_decode() can return null (or a scalar) on malformed input; guard
    // before handing it to mw_first_page(), which requires an array.
    $json = json_decode($body, true);
    $page = is_array($json) ? mw_first_page($json) : null;
    if (!$page || isset($page['missing'])) {
        return "No article found for '{$title}'.";
    }
    if (!isset($page['extract']) || trim($page['extract']) === '') {
        return "Article exists but has no plain-text extract for '{$title}'.";
    }
    // Trim extremely long responses to keep them LLM-friendly.
    $maxChars = 40000; // adjust for your pipeline
    $text = $page['extract'];
    if (mb_strlen($text, 'UTF-8') > $maxChars) {
        $text = mb_substr($text, 0, $maxChars, 'UTF-8') . "\n\n[Truncated]";
    }
    return $text;
}
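/*
 * Direct invocation examples (bypassing the agent, e.g. for manual testing):
 *
 *   echo wikiSummary("Finland"), PHP_EOL;
 *   echo mb_substr(wikiFullArticle("Bitcoin"), 0, 300), PHP_EOL; // first 300 chars
 */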