You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

342 lines
8.8 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

<?php
/*
* This file is part of the overtrue/pinyin.
*
* (c) overtrue <i@overtrue.me>
*
* This source file is subject to the MIT license that is bundled
* with this source code in the file LICENSE.
*/
namespace Overtrue\Pinyin;
use InvalidArgumentException;
/**
 * Converts Chinese text to pinyin.
 *
 * Dictionaries are supplied by a dict loader; the PINYIN_* option flags
 * (bit masks, combined with `|`) select tone style and which non-Chinese
 * characters survive the conversion.
 */
class Pinyin
{
    /**
     * Dict loader: either a loader instance or, until getLoader() is first
     * called, the class name of the loader to instantiate.
     *
     * @var \Overtrue\Pinyin\DictLoaderInterface|string
     */
    protected $loader;

    /**
     * Punctuations map: full-width / CJK punctuation => ASCII equivalent.
     *
     * The keys are easy to lose or confuse with their ASCII look-alikes, so
     * each one is annotated with its code point.
     *
     * @var array
     */
    protected $punctuations = [
        ',' => ',', // U+FF0C fullwidth comma
        '。' => '.', // U+3002 ideographic full stop
        '!' => '!', // U+FF01 fullwidth exclamation mark
        '?' => '?', // U+FF1F fullwidth question mark
        ':' => ':', // U+FF1A fullwidth colon
        '“' => '"', // U+201C left double quotation mark
        '”' => '"', // U+201D right double quotation mark
        '‘' => "'", // U+2018 left single quotation mark
        '’' => "'", // U+2019 right single quotation mark
        '_' => '_',
    ];

    /**
     * Constructor.
     *
     * @param string|null $loaderName Class name of the dict loader; falls back
     *                                to the file dict loader when empty.
     */
    public function __construct($loaderName = null)
    {
        $this->loader = $loaderName ?: 'Overtrue\\Pinyin\\FileDictLoader';
    }

    /**
     * Convert string to pinyin.
     *
     * @param string $string
     * @param int    $option
     *
     * @return array List of pinyin words.
     */
    public function convert($string, $option = PINYIN_DEFAULT)
    {
        $pinyin = $this->romanize($string, $option);

        return $this->splitWords($pinyin, $option);
    }

    /**
     * Convert string (person name) to pinyin.
     *
     * PINYIN_NAME is always added to the given option so that the surname
     * dictionary is consulted first.
     *
     * @param string $stringName
     * @param int    $option
     *
     * @return array List of pinyin words.
     */
    public function name($stringName, $option = PINYIN_NAME)
    {
        $option = $option | PINYIN_NAME;
        $pinyin = $this->romanize($stringName, $option);

        return $this->splitWords($pinyin, $option);
    }

    /**
     * Return a pinyin permalink from string.
     *
     * Numbers and English letters are always kept. An int passed as
     * $delimiter is treated as $option and the default delimiter is used.
     *
     * @param string     $string
     * @param string|int $delimiter One of '_', '-', '.', ''.
     * @param int        $option
     *
     * @return string
     *
     * @throws InvalidArgumentException When the delimiter is not allowed.
     */
    public function permalink($string, $delimiter = '-', $option = PINYIN_DEFAULT)
    {
        list($delimiter, $option) = $this->resolveDelimiter($delimiter, $option, '-');

        if (!\in_array($delimiter, ['_', '-', '.', ''], true)) {
            throw new InvalidArgumentException("Delimiter must be one of: '_', '-', '', '.'.");
        }

        return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_NUMBER | \PINYIN_KEEP_ENGLISH));
    }

    /**
     * Return first letters.
     *
     * Words containing a digit are kept whole; every other word is reduced
     * to its first character. An int passed as $delimiter is treated as
     * $option and the default delimiter is used.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option
     *
     * @return string
     */
    public function abbr($string, $delimiter = '', $option = PINYIN_DEFAULT)
    {
        list($delimiter, $option) = $this->resolveDelimiter($delimiter, $option, '');

        $initials = array_map(function ($pinyin) {
            if (\is_numeric($pinyin) || preg_match('/\d+/', $pinyin)) {
                return $pinyin;
            }

            return mb_substr($pinyin, 0, 1);
        }, $this->convert($string, $option | PINYIN_NO_TONE));

        return implode($delimiter, $initials);
    }

    /**
     * Chinese phrase to pinyin.
     *
     * An int passed as $delimiter is treated as $option and the default
     * delimiter is used.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option
     *
     * @return string
     */
    public function phrase($string, $delimiter = ' ', $option = PINYIN_DEFAULT)
    {
        list($delimiter, $option) = $this->resolveDelimiter($delimiter, $option, ' ');

        return implode($delimiter, $this->convert($string, $option));
    }

    /**
     * Chinese to pinyin sentence.
     *
     * Punctuation, English letters and numbers are always kept. An int
     * passed as $delimiter is treated as $option and the default delimiter
     * is used.
     *
     * @param string     $string
     * @param string|int $delimiter
     * @param int        $option
     *
     * @return string
     */
    public function sentence($string, $delimiter = ' ', $option = \PINYIN_NO_TONE)
    {
        list($delimiter, $option) = $this->resolveDelimiter($delimiter, $option, ' ');

        return implode($delimiter, $this->convert($string, $option | \PINYIN_KEEP_PUNCTUATION | \PINYIN_KEEP_ENGLISH | \PINYIN_KEEP_NUMBER));
    }

    /**
     * Loader setter.
     *
     * @param \Overtrue\Pinyin\DictLoaderInterface $loader
     *
     * @return $this
     */
    public function setLoader(DictLoaderInterface $loader)
    {
        $this->loader = $loader;

        return $this;
    }

    /**
     * Return dict loader.
     *
     * While $this->loader still holds a class name, the loader is created
     * here on first use, receiving the `data/` directory next to this
     * file's parent directory as its only constructor argument.
     *
     * @return \Overtrue\Pinyin\DictLoaderInterface
     */
    public function getLoader()
    {
        if (!($this->loader instanceof DictLoaderInterface)) {
            $dataDir = dirname(__DIR__) . '/data/';
            $loaderName = $this->loader;
            $this->loader = new $loaderName($dataDir);
        }

        return $this->loader;
    }

    /**
     * Convert Chinese to pinyin.
     *
     * @param string $string
     * @param int    $option
     *
     * @return string Text with every dictionary match replaced.
     */
    protected function romanize($string, $option = \PINYIN_DEFAULT)
    {
        $string = $this->prepare($string, $option);
        $dictLoader = $this->getLoader();

        if ($this->hasOption($option, \PINYIN_NAME)) {
            $string = $this->convertSurname($string, $dictLoader);
        }

        // The loader hands over one dictionary per callback invocation; each
        // one is applied to the result of the previous one.
        $dictLoader->map(function ($dictionary) use (&$string) {
            $string = strtr($string, $dictionary);
        });

        return $string;
    }

    /**
     * Convert Chinese Surname to pinyin.
     *
     * Only a surname at the very start of the string is replaced, and only
     * the first matching entry of a dictionary is applied.
     *
     * @param string                               $string
     * @param \Overtrue\Pinyin\DictLoaderInterface $dictLoader
     *
     * @return string
     */
    protected function convertSurname($string, $dictLoader)
    {
        $dictLoader->mapSurname(function ($dictionary) use (&$string) {
            foreach ($dictionary as $surname => $pinyin) {
                if (0 === strpos($string, $surname)) {
                    // Keep everything after the surname (null length = to the end).
                    $string = $pinyin . mb_substr($string, mb_strlen($surname, 'UTF-8'), null, 'UTF-8');

                    break;
                }
            }
        });

        return $string;
    }

    /**
     * Split pinyin string to words.
     *
     * @param string $pinyin
     * @param int    $option
     *
     * @return array List of words, re-indexed from 0.
     */
    protected function splitWords($pinyin, $option)
    {
        // PREG_SPLIT_NO_EMPTY drops only empty pieces. A bare array_filter()
        // would also drop the token "0", losing a standalone zero.
        $split = preg_split('/\s+/', (string) $pinyin, -1, PREG_SPLIT_NO_EMPTY);

        if (false === $split) {
            return [];
        }

        if (!$this->hasOption($option, PINYIN_TONE)) {
            foreach ($split as $index => $word) {
                $split[$index] = $this->formatTone($word, $option);
            }
        }

        return array_values($split);
    }

    /**
     * Check whether every bit of $check is set in $option.
     *
     * @param int $option
     * @param int $check
     *
     * @return bool
     */
    public function hasOption($option, $check)
    {
        return ($option & $check) === $check;
    }

    /**
     * Pre-process.
     *
     * Marks ASCII alphanumeric runs with a leading tab, optionally
     * normalises punctuation, then strips every character that is not
     * whitelisted by the given option.
     *
     * @param string $string
     * @param int    $option
     *
     * @return string
     */
    protected function prepare($string, $option = \PINYIN_DEFAULT)
    {
        // Prefix each run of ASCII letters/digits/_/- with a tab so that it
        // is later split off as a word of its own.
        $string = preg_replace_callback('~[a-z0-9_-]+~i', function ($matches) {
            return "\t" . $matches[0];
        }, $string);

        // Character-class fragments of everything that is allowed to stay.
        $regex = ['\p{Han}', '\p{Z}', '\p{M}', "\t"];

        if ($this->hasOption($option, \PINYIN_KEEP_NUMBER)) {
            $regex[] = '0-9';
        }

        if ($this->hasOption($option, \PINYIN_KEEP_ENGLISH)) {
            $regex[] = 'a-zA-Z';
        }

        if ($this->hasOption($option, \PINYIN_KEEP_PUNCTUATION)) {
            // NOTE(review): the ' ' => ' ' entry is a no-op as written;
            // presumably it once collapsed two spaces into one - confirm.
            $punctuations = array_merge($this->punctuations, ["\t" => ' ', ' ' => ' ']);
            $string = trim(str_replace(array_keys($punctuations), $punctuations, $string));

            // Whitelist both the full-width originals and their ASCII forms.
            $regex[] = preg_quote(implode(array_merge(array_keys($this->punctuations), $this->punctuations)), '~');
        }

        // preg_replace() yields null on malformed UTF-8; normalise to ''.
        return (string) preg_replace(\sprintf('~[^%s]~u', implode($regex)), '', $string);
    }

    /**
     * Format.
     *
     * Replaces tone-marked vowels with plain ASCII letters. With
     * PINYIN_ASCII_TONE the tone number is appended to the word; with
     * PINYIN_UMLAUT_V a toned "ü" becomes "v" instead of "yu".
     *
     * @param string $pinyin
     * @param int    $option
     *
     * @return string
     */
    protected function formatTone($pinyin, $option = \PINYIN_NO_TONE)
    {
        // Tone-marked character(s) => [ASCII replacement, tone number].
        // The two-character "ü + toned e" entries must stay first so they
        // are consumed before the single-vowel entries can match.
        $replacements = [
            'üē' => ['ue', 1], 'üé' => ['ue', 2], 'üě' => ['ue', 3], 'üè' => ['ue', 4],
            'ā' => ['a', 1], 'ē' => ['e', 1], 'ī' => ['i', 1], 'ō' => ['o', 1], 'ū' => ['u', 1], 'ǖ' => ['yu', 1],
            'á' => ['a', 2], 'é' => ['e', 2], 'í' => ['i', 2], 'ó' => ['o', 2], 'ú' => ['u', 2], 'ǘ' => ['yu', 2],
            'ǎ' => ['a', 3], 'ě' => ['e', 3], 'ǐ' => ['i', 3], 'ǒ' => ['o', 3], 'ǔ' => ['u', 3], 'ǚ' => ['yu', 3],
            'à' => ['a', 4], 'è' => ['e', 4], 'ì' => ['i', 4], 'ò' => ['o', 4], 'ù' => ['u', 4], 'ǜ' => ['yu', 4],
        ];

        $useUmlautV = $this->hasOption($option, \PINYIN_UMLAUT_V);
        $appendTone = $this->hasOption($option, PINYIN_ASCII_TONE);

        foreach ($replacements as $unicode => $replacement) {
            if (false === strpos($pinyin, $unicode)) {
                continue;
            }

            $umlaut = $replacement[0];

            // https://zh.wikipedia.org/wiki/%C3%9C
            if ($useUmlautV && 'yu' === $umlaut) {
                $umlaut = 'v';
            }

            $pinyin = str_replace($unicode, $umlaut, $pinyin) . ($appendTone ? $replacement[1] : '');
        }

        return $pinyin;
    }

    /**
     * Support the "option in the delimiter slot" call style.
     *
     * When $delimiter is an int it is taken to be the option, and the
     * method's default delimiter is used instead.
     *
     * @param string|int $delimiter
     * @param int        $option
     * @param string     $default   Delimiter to use when $delimiter is an int.
     *
     * @return array [delimiter, option]
     */
    private function resolveDelimiter($delimiter, $option, $default)
    {
        if (\is_int($delimiter)) {
            return [$default, $delimiter];
        }

        return [$delimiter, $option];
    }
}