// NOTE(review): the expression below is the tail of a function whose opening
// lines are outside this chunk (presumably cosineSimilarity, given the call
// site below). Kept byte-for-byte — do not edit without seeing its header.
0 ? $a / sqrt($b * $c) : 0; }

/**
 * Remove near-duplicate records from the dataset.
 *
 * Each record's conversation turns are concatenated into one string
 * (via concatenateConversations(), defined elsewhere in this file) and
 * compared against the remaining records using cosine similarity over
 * word-frequency vectors. Records with similarity >= 0.95 to a longer
 * record form a duplicate cluster; within each cluster the record with
 * the most conversation turns is kept.
 *
 * NOTE(review): str_word_count() is byte/ASCII oriented and will not
 * tokenise CJK text — confirm the inputs are word-separated, otherwise
 * the similarity scores are unreliable.
 *
 * Bug fixes vs. the original:
 *  - The best match was tracked with "$last_index > 0", so a cluster whose
 *    best duplicate sat at index 0 was dropped entirely (nothing kept).
 *  - When $longest had more turns than a duplicate, the index of the
 *    *duplicate* was still recorded, so the smaller record could be kept.
 *  Both are fixed by tracking a single "best" candidate, seeded with
 *  $longest, and always emitting exactly one survivor per cluster.
 *
 * @param array $data list of records, each carrying a 'conversations' key
 * @return array the de-duplicated records
 */
function filterConversations($data) {
    // Pair each record with its concatenated conversation text.
    $conversationsStr = array_map(function ($item) {
        return [
            'content'  => concatenateConversations($item['conversations']),
            'original' => $item,
        ];
    }, $data);

    // Longest text first, so every cluster is anchored on its longest member.
    usort($conversationsStr, function ($a, $b) {
        return strlen($b['content']) <=> strlen($a['content']);
    });

    $filteredData = [];
    while (!empty($conversationsStr)) {
        $longest = array_shift($conversationsStr);
        $tokensB = array_count_values(str_word_count($longest['content'], 1));

        // Best survivor of the cluster anchored on $longest (see doc above).
        $best      = $longest;
        $bestCount = count($longest['original']['conversations']);

        $remaining = [];
        foreach ($conversationsStr as $item) {
            $tokensA    = array_count_values(str_word_count($item['content'], 1));
            $similarity = cosineSimilarity($tokensA, $tokensB);
            if ($similarity < 0.95) {
                // Not a duplicate — keep it for later rounds.
                $remaining[] = $item;
            } else {
                // Duplicate: keep whichever record has more conversation turns.
                $itemCount = count($item['original']['conversations']);
                if ($itemCount > $bestCount) {
                    $best      = $item;
                    $bestCount = $itemCount;
                }
            }
        }

        $filteredData[]   = $best['original'];
        $conversationsStr = $remaining;

        // Progress indicator: remaining record count on the same console line.
        print_r("\r" . count($conversationsStr));
    }
    return $filteredData;
}

/**
 * Encode $data as pretty-printed JSON (multi-byte text left readable)
 * and write it to $filename.
 */
function writeJsonFile($data, $filename) {
    $jsonData = json_encode($data, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    file_put_contents($filename, $jsonData);
}

/**
 * Entry point: read the merged dataset, de-duplicate it in parallel child
 * processes (pcntl_fork), then merge the partial results, shuffle them,
 * delete the partials, and move the merged file to a timestamped name.
 *
 * Relies on readJsonFile(), defined elsewhere in this file.
 */
function main() {
    $inputFilename = 'merged_data1716636036.json';
    $data = readJsonFile($inputFilename);
    echo "Reading data completed.\n" . count($data) . "数据";

    // Split the data into two halves, one per worker process.
    // Bug fix: clamp the chunk size to >= 1 — with fewer than two records
    // the original intval(count/2) was 0, which makes array_chunk() throw.
    $parts = array_chunk($data, max(1, intval(count($data) / 2)));

    $processes = [];
    foreach ($parts as $index => $part) {
        $pid = pcntl_fork();
        if ($pid == -1) {
            die('Could not fork');
        } else if ($pid) {
            // Parent: remember the child PID so we can wait on it below.
            $processes[] = $pid;
        } else {
            // Child: filter its slice, write the partial file, then exit.
            processPart($part, $index);
            exit();
        }
    }

    // Wait for every worker to finish before merging the partial files.
    $status = null;
    foreach ($processes as $process) {
        pcntl_waitpid($process, $status);
    }
    echo "All processes completed.\n";

    // Merge every JSON file found in the datasets/ directory.
    $directory = "datasets/";
    $allData = [];
    if ($handle = opendir($directory)) {
        while (false !== ($entry = readdir($handle))) {
            if ($entry != "." && $entry != ".." && pathinfo($entry, PATHINFO_EXTENSION) == 'json') {
                $jsonContent = file_get_contents($directory . $entry);
                echo $directory . $entry;
                $data = json_decode($jsonContent, true);
                // Bug fix: skip files that fail to decode instead of passing
                // null to array_merge() (a fatal TypeError on PHP 8).
                if (is_array($data)) {
                    $allData = array_merge($allData, $data);
                }
            }
        }
        closedir($handle);
    }
    shuffle($allData);

    // Consistency fix: add JSON_UNESCAPED_UNICODE so the merged file keeps
    // multi-byte text readable, matching writeJsonFile().
    $finalJson = json_encode($allData, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT);
    file_put_contents("datasets/merged_data.json", $finalJson);
    echo "All JSON files have been merged into merged_data.json\n";

    // Remove the per-process partial files now that they are merged.
    $pattern = $directory . 'filtered_data_part_[0-9]*.json';
    $files = glob($pattern);
    foreach ($files as $file) {
        if (is_file($file)) {
            if (unlink($file)) {
                echo "Deleted: $file\n";
            } else {
                echo "Error deleting: $file\n";
            }
        }
    }

    // Move the merged file to a timestamped name in the working directory.
    $sourceFile      = $directory . 'merged_data.json';
    $destinationFile = 'merged_data' . time() . '.json';
    if (rename($sourceFile, $destinationFile)) {
        echo "文件成功移动到: $destinationFile";
    } else {
        echo "文件移动失败!";
    }
}

/**
 * Worker body: filter one slice of the dataset and write the result to
 * this worker's own partial file under datasets/.
 */
function processPart($dataPart, $index) {
    $filteredData = filterConversations($dataPart);
    $outputFilename = "datasets/filtered_data_part_$index.json";
    writeJsonFile($filteredData, $outputFilename);
    echo "Process $index: Writing data completed.\n";
}

main();
// Bug fix: the original ended with a doubled "?> ?>" — the second closing tag
// was emitted as literal output. The closing tag is omitted entirely, per
// convention for PHP-only files.