datas / test.php
taozi555's picture
Upload folder using huggingface_hub
1ea204c verified
raw
history blame contribute delete
No virus
5.87 kB
<?php
// Remove PHP's memory cap ('-1' means unlimited): the script reads the whole
// input file into a string and decodes it into arrays in one go.
ini_set('memory_limit', '-1');
/**
 * Read a JSON file and decode it, returning JSON objects as associative arrays.
 *
 * @param string $filename Path of the JSON file to load.
 * @return mixed The decoded JSON value.
 * @throws RuntimeException If the file cannot be read or its content is not valid JSON.
 */
function readJsonFile($filename) {
    $jsonString = file_get_contents($filename);
    if ($jsonString === false) {
        throw new RuntimeException("Unable to read JSON file: $filename");
    }

    $data = json_decode($jsonString, true);
    // json_decode() returns null both for the JSON literal `null` and on a
    // parse failure, so the error code is the only reliable failure signal.
    if (json_last_error() !== JSON_ERROR_NONE) {
        throw new RuntimeException("Invalid JSON in $filename: " . json_last_error_msg());
    }

    return $data;
}
/**
 * Join the 'content' field of every message into one space-separated string.
 *
 * @param array $conversations List of messages, each an array with a 'content' entry.
 * @return string All contents joined by single spaces, with surrounding whitespace trimmed.
 */
function concatenateConversations($conversations) {
    $contents = array_map(function($message) {
        return $message['content'];
    }, $conversations);

    // Joining with a separator and trimming yields the same text as prefixing
    // every piece with a space and trimming afterwards.
    $joined = implode(' ', $contents);

    return trim($joined);
}
/**
 * Cosine similarity between two bag-of-words vectors.
 *
 * Each argument maps a token to its occurrence count. Only tokens present in
 * both maps contribute to the dot product, so the vectors are walked directly
 * instead of first building and de-duplicating the union of their keys.
 *
 * @param array $tokensA Token => count map of the first text.
 * @param array $tokensB Token => count map of the second text.
 * @return float|int Similarity (1 for proportional vectors of non-negative counts),
 *                   or integer 0 when either vector has zero magnitude.
 */
function cosineSimilarity($tokensA, $tokensB) {
    $dot = 0;
    $normA = 0;
    foreach ($tokensA as $token => $countA) {
        $normA += $countA * $countA;
        if (isset($tokensB[$token])) {
            $dot += $countA * $tokensB[$token];
        }
    }

    $normB = 0;
    foreach ($tokensB as $countB) {
        $normB += $countB * $countB;
    }

    // Guard against division by zero when one side has no tokens.
    return $normA * $normB > 0 ? $dot / sqrt($normA * $normB) : 0;
}
/**
 * Collapse near-duplicate conversations so that each group of similar entries
 * contributes exactly one record to the result.
 *
 * Entries are processed from the longest concatenated text to the shortest.
 * The current longest entry is compared with every remaining one; entries whose
 * word-count cosine similarity is at least 0.95 form its duplicate group and
 * are removed from further processing. From each group the entry with the most
 * conversation turns is kept; on a tie the longest text wins.
 *
 * NOTE(review): str_word_count() does not support multibyte text, so
 * predominantly non-ASCII conversations may yield few or no tokens and never be
 * detected as duplicates - confirm this matches the dataset.
 *
 * @param array $data List of records, each with a 'conversations' list of messages.
 * @return array The retained original records, one per duplicate group.
 */
function filterConversations($data) {
    $conversationsStr = array_map(function($item) {
        return ['content' => concatenateConversations($item['conversations']), 'original' => $item];
    }, $data);

    // Longest text first, so every group is anchored on its most complete entry.
    usort($conversationsStr, function($a, $b) {
        return strlen($b['content']) <=> strlen($a['content']);
    });

    $filteredData = [];
    while (!empty($conversationsStr)) {
        $longest = array_shift($conversationsStr);
        $tokensB = array_count_values(str_word_count($longest['content'], 1));

        // Best representative of the current group; starts as the anchor itself
        // so a group can never end up without an output record.
        $best = $longest;
        $bestCount = count($longest['original']['conversations']);

        $newConversationsStr = [];
        foreach ($conversationsStr as $item) {
            $tokensA = array_count_values(str_word_count($item['content'], 1));
            if (cosineSimilarity($tokensA, $tokensB) < 0.95) {
                // Not a duplicate of the anchor: keep it for a later round.
                $newConversationsStr[] = $item;
                continue;
            }

            // Duplicate: it replaces the representative only when it has
            // strictly more turns.
            $itemCount = count($item['original']['conversations']);
            if ($itemCount > $bestCount) {
                $best = $item;
                $bestCount = $itemCount;
            }
        }

        $filteredData[] = $best['original'];
        $conversationsStr = $newConversationsStr;
        // Progress indicator: number of entries still to be processed.
        print_r("\r".count($conversationsStr));
    }
    return $filteredData;
}
/**
 * Serialize data as pretty-printed JSON (Unicode left unescaped) and write it to a file.
 *
 * The target file is only touched once encoding has succeeded, so an encoding
 * failure can no longer replace existing content with an empty file.
 *
 * @param mixed  $data     Value to serialize.
 * @param string $filename Destination path.
 * @return void
 * @throws RuntimeException If the data cannot be encoded or the file cannot be written.
 */
function writeJsonFile($data, $filename) {
    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    if ($jsonData === false) {
        throw new RuntimeException("Unable to encode data for $filename: " . json_last_error_msg());
    }

    if (file_put_contents($filename, $jsonData) === false) {
        throw new RuntimeException("Unable to write JSON file: $filename");
    }
}
/**
 * Run the whole pipeline: load the input dataset, de-duplicate it in parallel
 * worker processes, merge the per-worker results, shuffle them and move the
 * merged file to a timestamped name in the working directory.
 *
 * Each worker only de-duplicates its own slice; duplicates that land in
 * different slices are not compared against each other within a single run.
 *
 * The script stops before merging or deleting anything when a worker fails or
 * an intermediate file is unusable, so partial results are never mistaken for
 * a complete dataset.
 *
 * @return void
 */
function main() {
    $inputFilename = 'merged_data1716636036.json';
    $directory = "datasets/";
    // Number of worker processes the data is split across.
    $workerCount = 2;

    $data = readJsonFile($inputFilename);
    echo "Reading data completed.\n".count($data)."数据";

    // Workers write their part files here, so the directory has to exist first.
    if (!is_dir($directory) && !mkdir($directory, 0777, true)) {
        fwrite(STDERR, "Could not create directory: $directory\n");
        exit(1);
    }

    // Split the data into at most $workerCount slices. Rounding up keeps the
    // slice count bounded for odd sizes, and max() avoids a zero chunk size
    // for tiny inputs.
    $chunkSize = max(1, (int) ceil(count($data) / $workerCount));
    $parts = array_chunk($data, $chunkSize);

    $processes = [];
    foreach ($parts as $index => $part) {
        $pid = pcntl_fork();
        if ($pid == -1) {
            die('Could not fork');
        } else if ($pid) {
            // Parent process: remember the child so it can be awaited below.
            $processes[] = $pid;
        } else {
            // Child process: handle one slice, then leave without running the merge.
            processPart($part, $index);
            exit();
        }
    }

    // Wait for every worker and verify that it exited cleanly.
    $failedWorkers = 0;
    foreach ($processes as $process) {
        $status = 0;
        $waited = pcntl_waitpid($process, $status);
        if ($waited === -1 || !pcntl_wifexited($status) || pcntl_wexitstatus($status) !== 0) {
            $failedWorkers++;
        }
    }
    if ($failedWorkers > 0) {
        fwrite(STDERR, "$failedWorkers worker process(es) failed; aborting before merge.\n");
        exit(1);
    }
    echo "All processes completed.\n";

    // Merge every JSON file found in the output directory.
    // NOTE(review): this also picks up any other *.json file left in the
    // directory (for example a stale merged_data.json) - confirm that is intended.
    $allData = [];
    if ($handle = opendir($directory)) {
        while (false !== ($entry = readdir($handle))) {
            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
                $jsonContent = file_get_contents($directory . $entry);
                echo $directory . $entry;
                $decoded = json_decode($jsonContent, true);
                if (!is_array($decoded)) {
                    // An unreadable or malformed file would otherwise be lost
                    // for good once the part files are deleted below.
                    closedir($handle);
                    fwrite(STDERR, "\nInvalid JSON array in " . $directory . $entry . "; aborting before merge.\n");
                    exit(1);
                }
                $allData = array_merge($allData, $decoded);
            }
        }
        closedir($handle);
    }

    shuffle($allData);

    // Write the merged dataset; only continue to the cleanup when that worked.
    $sourceFile = $directory.'merged_data.json';
    $finalJson = json_encode($allData, JSON_PRETTY_PRINT);
    if ($finalJson === false || file_put_contents($sourceFile, $finalJson) === false) {
        fwrite(STDERR, "Failed to write $sourceFile; part files were kept.\n");
        exit(1);
    }
    echo "All JSON files have been merged into merged_data.json\n";

    // Remove the per-worker part files now that their content is merged.
    $pattern = $directory . 'filtered_data_part_[0-9]*.json';
    $files = glob($pattern);
    if ($files === false) {
        $files = [];
    }
    foreach ($files as $file) {
        if (is_file($file)) {
            if (unlink($file)) {
                echo "Deleted: $file\n";
            } else {
                echo "Error deleting: $file\n";
            }
        }
    }

    // Move the merged file out of the directory under a timestamped name.
    $destinationFile = 'merged_data'.time().'.json';
    if (rename($sourceFile, $destinationFile)) {
        echo "文件成功移动到: $destinationFile";
    } else {
        echo "文件移动失败!";
    }
}
/**
 * Worker task: de-duplicate one slice of the dataset and store the result in
 * that slice's own part file under datasets/.
 *
 * @param array      $dataPart Slice of records to filter.
 * @param int|string $index    Slice number, used in the part file name and the log line.
 * @return void
 */
function processPart($dataPart, $index) {
    writeJsonFile(
        filterConversations($dataPart),
        "datasets/filtered_data_part_$index.json"
    );
    echo "Process $index: Writing data completed.\n";
}
// Script entry point. The closing PHP tag is intentionally omitted: the file
// previously ended with two of them, and the second one was sent to the
// output as literal text after the run.
main();