Version: v4.1.15

linux/mm/memcontrol.c

/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it is incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better than
 * using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
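/*
 * Number of page events that must accumulate before the corresponding
 * target above is checked again (consumed by mem_cgroup_event_ratelimit()
 * further down in this file).
 */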
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEMCG_NR_EVENTS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct reclaim_iter {
	struct mem_cgroup *position;
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct reclaim_iter	iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long		usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded */
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	unsigned long threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* Index of the threshold just below or equal to the current usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace want to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * The register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on the eventfd to send a notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * The unregister_event() callback will be called when userspace closes
	 * the eventfd or when the cgroup is removed.  This callback must be
	 * set if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;

	/* Accounted resources */
	struct page_counter memory;
	struct page_counter memsw;
	struct page_counter kmem;

	/* Normal memory consumption range */
	unsigned long low;
	unsigned long high;

	unsigned long soft_limit;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/* css_online() has been completed */
	int initialized;

	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to another cgroup.
	 */
	atomic_t		moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t		move_lock;
	struct task_struct	*move_lock_task;
	unsigned long		move_lock_flags;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * Used when a CPU is offlined or for other synchronization;
	 * see mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* Index in the kmem_cache->memcg_params.memcg_caches array */
	int kmemcg_id;
	bool kmem_acct_activated;
	bool kmem_acct_active;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

#ifdef CONFIG_MEMCG_KMEM
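/* Report whether kernel memory accounting is currently active for @memcg. */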
bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return memcg->kmem_acct_active;
}
#endif

/* Stuffs for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
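/*
 * For example, MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL) packs the res_type
 * into the upper 16 bits and the attribute into the lower 16 bits;
 * MEMFILE_TYPE() and MEMFILE_ATTR() recover the two halves.
 */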

/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);

struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
	return s ? container_of(s, struct mem_cgroup, css) : NULL;
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return (memcg == root_mem_cgroup);
}

/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX

static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
	return memcg->css.id;
}

/*
 * A helper function to get a mem_cgroup from an ID.  Must be called under
 * rcu_read_lock().  The caller is responsible for calling
 * css_tryget_online() if the mem_cgroup is used for charging.  (Dropping
 * a refcnt from swap can be called against a removed memcg.)
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
	struct cgroup_subsys_state *css;

	css = css_from_id(id, &memory_cgrp_subsys);
	return mem_cgroup_from_css(css);
}

/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)

void sock_update_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled) {
		struct mem_cgroup *memcg;
		struct cg_proto *cg_proto;

		BUG_ON(!sk->sk_prot->proto_cgroup);

		/* Socket cloning can throw us here with sk_cgrp already
		 * filled.  It won't, however, necessarily happen from
		 * process context, so the test for the root memcg given
		 * the current task's memcg won't help us in this case.
		 *
		 * Respecting the original socket's memcg is a better
		 * decision in this case.
		 */
		if (sk->sk_cgrp) {
			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
			css_get(&sk->sk_cgrp->memcg->css);
			return;
		}

		rcu_read_lock();
		memcg = mem_cgroup_from_task(current);
		cg_proto = sk->sk_prot->proto_cgroup(memcg);
		if (!mem_cgroup_is_root(memcg) &&
		    memcg_proto_active(cg_proto) &&
		    css_tryget_online(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
		}
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(sock_update_memcg);

void sock_release_memcg(struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
		struct mem_cgroup *memcg;
		WARN_ON(!sk->sk_cgrp->memcg);
		memcg = sk->sk_cgrp->memcg;
		css_put(&sk->sk_cgrp->memcg->css);
	}
}

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg || mem_cgroup_is_root(memcg))
		return NULL;

	return &memcg->tcp_mem;
}
EXPORT_SYMBOL(tcp_proto_cgroup);

#endif

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this is that it works
 * better in sparse environments, where we have a lot of memcgs but only a
 * few are kmem-limited.  If, for instance, we had 200 memcgs and none but
 * the 200th were kmem-limited, we would still need a 200-entry array for
 * it.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1 because we would like to avoid going through
 * the alloc/free process all the time.  On a small machine, 4 kmem-limited
 * cgroups is a reasonable guess.  In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler.  Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

#endif /* CONFIG_MEMCG_KMEM */

static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

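/*
 * Insert @mz into the per-zone soft limit tree @mctz, keyed by how far the
 * memcg's usage exceeds its soft limit.  Entries with no excess are not
 * inserted.  Called with mctz->lock held.
 */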
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
				       struct mem_cgroup_tree_per_zone *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	mctz = soft_limit_tree_from_page(page);
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now, but someone else can add it back;
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement a "quick" read.  There is a trade-off between
 * the reading cost and the precision of the value, so we may have a chance
 * to implement periodic synchronization of the counters in memcg as well.
 *
 * But this _read() function is currently used for the user interface.  The
 * user accounts memory usage by memory cgroup and _always_ requires an exact
 * value, because that is what is being accounted.  Even if we provided a
 * quick-and-fuzzy read, we would still have to visit all online cpus and sum
 * them up.  So, for now, the extra synchronization is not implemented (it is
 * only done for cpu hotplug).
 *
 * If there are kernel-internal users that can make use of a not-exact value,
 * and reading all cpu values becomes a performance bottleneck in some common
 * workload, a threshold and synchronization scheme like vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* a pagein of a big page counts as one event, so ignore the page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
						  int nid,
						  unsigned int lru_mask)
{
	unsigned long nr = 0;
	int zid;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
		struct mem_cgroup_per_zone *mz;
		enum lru_list lru;

		for_each_lru(lru) {
			if (!(BIT(lru) & lru_mask))
				continue;
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			nr += mz->lru_size[lru];
		}
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

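/*
 * Returns true when the per-cpu page event counter has passed the target
 * for @target and advances that target; used by memcg_check_events() to
 * rate-limit threshold, soft limit and numainfo updates.
 */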
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}

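/*
 * Returns the memcg of @mm's owner task, or the root memcg when there is
 * no mm or no owner, with an online css reference held.  The caller must
 * css_put() it when done.
 */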
static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_zone *mz;

		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		do {
			pos = READ_ONCE(iter->position);
			/*
			 * A racing update may change the position and
			 * put the last reference, hence css_tryget(),
			 * or retry to see the updated position.
			 */
		} while (pos && !css_tryget(&pos->css));
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css)) {
			/*
			 * Make sure the memcg is initialized:
			 * mem_cgroup_css_online() orders the
			 * initialization against setting the flag.
			 */
			if (smp_load_acquire(&memcg->initialized))
				break;

			css_put(css);
		}

		memcg = NULL;
	}

	if (reclaim) {
		if (cmpxchg(&iter->position, pos, memcg) == pos) {
			if (memcg)
				css_get(&memcg->css);
			if (pos)
				css_put(&pos->css);
		}

		/*
		 * pairs with css_tryget when dereferencing iter->position
		 * above.
		 */
		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called when a page is added to or removed from an
 * lru list.
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
	*lru_size += nr_pages;
	VM_BUG_ON((long)(*lru_size) < 0);
}

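/*
 * Returns true when @memcg is @root itself or, if @root uses hierarchy,
 * when @memcg's cgroup is a descendant of @root's cgroup.
 */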
bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
{
	if (root == memcg)
		return true;
	if (!root->use_hierarchy)
		return false;
	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		task_memcg = mem_cgroup_from_task(task);
		css_get(&task_memcg->css);
		rcu_read_unlock();
	}
	ret = mem_cgroup_is_descendant(task_memcg, memcg);
	css_put(&task_memcg->css);
	return ret;
}

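/*
 * The inactive anon list is considered low when inactive * ratio < active;
 * the ratio is int_sqrt(10 * size-in-GB) (roughly 3 at 1GB, 10 at 10GB)
 * and 1 below 1GB.
 */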
int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
{
	unsigned long inactive_ratio;
	unsigned long inactive;
	unsigned long active;
	unsigned long gb;

	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	return inactive * inactive_ratio < active;
}

bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return true;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	memcg = mz->memcg;

	return !!(memcg->css.flags & CSS_ONLINE);
}

#define mem_cgroup_from_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @memcg can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.limit);
	if (count < limit)
		margin = limit - count;

	if (do_swap_account) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.limit);
		if (count <= limit)
			margin = min(margin, limit - count);
	}

	return margin;
}

int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	/* root ? */
	if (mem_cgroup_disabled() || !memcg->css.parent)
		return vm_swappiness;

	return memcg->swappiness;
}

/*
 * A routine for checking "mem" is under move_account() or not.
 *
 * Checking a cgroup is mc.from or mc.to or under hierarchy of
 * moving cgroups. This is for waiting at high-memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike the task_move routines, we access mc.to and mc.from without
	 * mutual exclusion by cgroup_mutex.  Here, we take the spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
	if (mc.moving_task && current != mc.moving_task) {
		if (mem_cgroup_under_move(memcg)) {
			DEFINE_WAIT(wait);
			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
			/* moving charge context might have finished. */
			if (mc.moving_task)
				schedule();
			finish_wait(&mc.waitq, &wait);
			return true;
		}
	}
	return false;
}

#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
	/* oom_info_lock ensures that parallel ooms do not interleave */
	static DEFINE_MUTEX(oom_info_lock);
	struct mem_cgroup *iter;
	unsigned int i;

	mutex_lock(&oom_info_lock);
	rcu_read_lock();

	if (p) {
		pr_info("Task in ");
		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
		pr_cont(" killed as a result of limit of ");
	} else {
		pr_info("Memory limit reached of cgroup ");
	}

	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont("\n");

	rcu_read_unlock();

	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memory)),
		K((u64)memcg->memory.limit), memcg->memory.failcnt);
	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->memsw)),
		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
		K((u64)page_counter_read(&memcg->kmem)),
		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);

	for_each_mem_cgroup_tree(iter, memcg) {
		pr_info("Memory cgroup stats for ");
		pr_cont_cgroup_path(iter->css.cgroup);
		pr_cont(":");

		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
				continue;
			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
				K(mem_cgroup_read_stat(iter, i)));
		}

		for (i = 0; i < NR_LRU_LISTS; i++)
			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

		pr_cont("\n");
	}
	mutex_unlock(&oom_info_lock);
}

/*
 * This function returns the number of memcgs in the hierarchy tree.  Returns
 * 1 (self count) if there are no children.
 */
static int mem_cgroup_count_children(struct mem_cgroup *memcg)
{
	int num = 0;
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		num++;
	return num;
}

/*
 * Return the memory (and swap, if configured) limit for a memcg.
 */
static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
	unsigned long limit;

	limit = memcg->memory.limit;
	if (mem_cgroup_swappiness(memcg)) {
		unsigned long memsw_limit;

		memsw_limit = memcg->memsw.limit;
		limit = min(limit + total_swap_pages, memsw_limit);
	}
	return limit;
}

static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				     int order)
{
	struct mem_cgroup *iter;
	unsigned long chosen_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
		mark_tsk_oom_victim(current);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, &it);
		while ((task = css_task_iter_next(&it))) {
			switch (oom_scan_process_thread(task, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = ULONG_MAX;
				get_task_struct(chosen);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				css_task_iter_end(&it);
				mem_cgroup_iter_break(memcg, iter);
				if (chosen)
					put_task_struct(chosen);
				return;
			case OOM_SCAN_OK:
				break;
			}
			points = oom_badness(task, memcg, NULL, totalpages);
			if (!points || points < chosen_points)
				continue;
			/* Prefer thread group leaders for display purposes */
			if (points == chosen_points &&
			    thread_group_leader(chosen))
				continue;

			if (chosen)
				put_task_struct(chosen);
			chosen = task;
			chosen_points = points;
			get_task_struct(chosen);
		}
		css_task_iter_end(&it);
	}

	if (!chosen)
		return;
	points = chosen_points * 1000 / totalpages;
	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}

#if MAX_NUMNODES > 1

/**
 * test_mem_cgroup_node_reclaimable
 * @memcg: the target memcg
 * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
 *
 * This function returns whether the specified memcg contains any
 * reclaimable pages on a node. Returns true if there are any reclaimable
 * pages in the node.
 */
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
		int nid, bool noswap)
{
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
		return true;
	if (noswap || !total_swap_pages)
		return false;
	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
		return true;
	return false;
}

/*
 * Always updating the nodemask is not very good - even if we have an empty
 * list or the wrong list here, we can start from some node and traverse all
 * nodes based on the zonelist.  So update the list loosely once per 10 seconds.
 */
static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
{
	int nid;
	/*
	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
	 * pagein/pageout changes since the last update.
	 */
	if (!atomic_read(&memcg->numainfo_events))
		return;
	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
		return;

	/* make a nodemask where this memcg uses memory from */
	memcg->scan_nodes = node_states[N_MEMORY];

	for_each_node_mask(nid, node_states[N_MEMORY]) {

		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
			node_clear(nid, memcg->scan_nodes);
	}

	atomic_set(&memcg->numainfo_events, 0);
	atomic_set(&memcg->numainfo_updating, 0);
}

/*
 * Select a node from which to start reclaim.  Because what we need is just
 * to reduce the usage counter, starting from anywhere is OK.  Reclaiming
 * from the current node has both pros and cons.
 *
 * Freeing memory from the current node means freeing memory from a node which
 * we'll use or we've used, so it may degrade the LRU.  And if several threads
 * hit limits, they will contend on one node.  But freeing from a remote
 * node means higher reclaim costs because of memory latency.
 *
 * For now, we use round-robin.  A better algorithm is welcome.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_may_update_nodemask(memcg);
	node = memcg->last_scanned_node;

	node = next_node(node, memcg->scan_nodes);
	if (node == MAX_NUMNODES)
		node = first_node(memcg->scan_nodes);
	/*
	 * We call this when we hit the limit, not when pages are added to the
	 * LRU.  No LRU may hold pages because all pages are UNEVICTABLE, or
	 * the memcg is too small and no pages are on the LRU.  In that case,
	 * we use the current node.
	 */
	if (unlikely(node == MAX_NUMNODES))
		node = numa_node_id();

	memcg->last_scanned_node = node;
	return node;
}
#else
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
#endif

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
				   struct zone *zone,
				   gfp_t gfp_mask,
				   unsigned long *total_scanned)
{
	struct mem_cgroup *victim = NULL;
	int total = 0;
	int loop = 0;
	unsigned long excess;
	unsigned long nr_scanned;
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = 0,
	};

	excess = soft_limit_excess(root_memcg);

	while (1) {
		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
		if (!victim) {
			loop++;
			if (loop >= 2) {
				/*
				 * If we have not been able to reclaim
				 * anything, it might be because there are
				 * no reclaimable pages under this hierarchy.
				 */
				if (!total)
					break;
				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive so as to
				 * reclaim too much, nor too small so that we
				 * keep coming back to reclaim from this cgroup.
				 */
				if (total >= (excess >> 2) ||
					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
					break;
			}
			continue;
		}
		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
						     zone, &nr_scanned);
		*total_scanned += nr_scanned;
		if (!soft_limit_excess(root_memcg))
			break;
	}
	mem_cgroup_iter_break(root_memcg, victim);
	return total;
}

#ifdef CONFIG_LOCKDEP
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

static DEFINE_SPINLOCK(memcg_oom_lock);

/*
 * Check OOM-Killer is already running under our hierarchy.
 * If someone is running, return false.
 */
static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter, *failed = NULL;

	spin_lock(&memcg_oom_lock);

	for_each_mem_cgroup_tree(iter, memcg) {
		if (iter->oom_lock) {
			/*
			 * this subtree of our hierarchy is already locked,
			 * so we cannot take the lock.
			 */
			failed = iter;
			mem_cgroup_iter_break(memcg, iter);
			break;
		} else
			iter->oom_lock = true;
	}

	if (failed) {
		/*
		 * OK, we failed to lock the whole subtree, so we have
		 * to clean up what we set up, up to the failing memcg.
		 */
		for_each_mem_cgroup_tree(iter, memcg) {
			if (iter == failed) {
				mem_cgroup_iter_break(memcg, iter);
				break;
			}
			iter->oom_lock = false;
		}
	} else
		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);

	spin_unlock(&memcg_oom_lock);

	return !failed;
}

static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	spin_lock(&memcg_oom_lock);
	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	spin_unlock(&memcg_oom_lock);
}

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

struct oom_wait_info {
	struct mem_cgroup *memcg;
	wait_queue_t	wait;
};

static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
	struct mem_cgroup *oom_wait_memcg;
	struct oom_wait_info *oom_wait_info;

	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
	oom_wait_memcg = oom_wait_info->memcg;

	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
		return 0;
	return autoremove_wake_function(wait, mode, sync, arg);
}

static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->oom_wakeups);
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
	if (!current->memcg_oom.may_oom)
		return;
	/*
	 * We are in the middle of the charge context here, so we
	 * don't want to block when potentially sitting on a callstack
	 * that holds all kinds of filesystem and mm locks.
	 *
	 * Also, the caller may handle a failed allocation gracefully
	 * (like optional page cache readahead) and so an OOM killer
	 * invocation might not even be necessary.
	 *
	 * That's why we don't do anything here except remember the
	 * OOM context and then deal with it at the end of the page
	 * fault when the stack is unwound, the locks are released,
	 * and when we know whether the fault was overall successful.
	 */
	css_get(&memcg->css);
	current->memcg_oom.memcg = memcg;
	current->memcg_oom.gfp_mask = mask;
	current->memcg_oom.order = order;
}

/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
	struct mem_cgroup *memcg = current->memcg_oom.memcg;
	struct oom_wait_info owait;
	bool locked;

	/* OOM is global, do not handle */
	if (!memcg)
		return false;

	if (!handle || oom_killer_disabled)
		goto cleanup;

	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);

	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	mem_cgroup_mark_under_oom(memcg);

	locked = mem_cgroup_oom_trylock(memcg);

	if (locked)
		mem_cgroup_oom_notify(memcg);

	if (locked && !memcg->oom_kill_disable) {
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
					 current->memcg_oom.order);
	} else {
		schedule();
		mem_cgroup_unmark_under_oom(memcg);
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}

	if (locked) {
		mem_cgroup_oom_unlock(memcg);
		/*
		 * There is no guarantee that an OOM-lock contender
		 * sees the wakeups triggered by the OOM kill
		 * uncharges.  Wake any sleepers explicitly.
		 */
		memcg_oom_recover(memcg);
	}
cleanup:
	current->memcg_oom.memcg = NULL;
	css_put(&memcg->css);
	return true;
}

/**
 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
 * @page: page that is going to change accounted state
 *
 * This function must mark the beginning of an accounted page state
 * change to prevent double accounting when the page is concurrently
 * being moved to another memcg:
 *
 *   memcg = mem_cgroup_begin_page_stat(page);
 *   if (TestClearPageState(page))
 *     mem_cgroup_update_page_stat(memcg, state, -1);
 *   mem_cgroup_end_page_stat(memcg);
 */
struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	/*
	 * The RCU lock is held throughout the transaction.  The fast
	 * path can get away without acquiring the memcg->move_lock
	 * because page moving starts with an RCU grace period.
	 *
	 * The RCU lock also protects the memcg from being freed when
	 * the page state that is going to change is the only thing
	 * preventing the page from being uncharged.
	 * E.g. end-writeback clearing PageWriteback(), which allows
	 * migration to go ahead and uncharge the page before the
	 * account transaction might be complete.
	 */
	rcu_read_lock();

	if (mem_cgroup_disabled())
		return NULL;
again:
	memcg = page->mem_cgroup;
	if (unlikely(!memcg))
		return NULL;

	if (atomic_read(&memcg->moving_account) <= 0)
		return memcg;

	spin_lock_irqsave(&memcg->move_lock, flags);
	if (memcg != page->mem_cgroup) {
		spin_unlock_irqrestore(&memcg->move_lock, flags);
		goto again;
	}

	/*
	 * When charge migration first begins, we can have locked and
	 * unlocked page stat updates happening concurrently.  Track
	 * the task that holds the lock for mem_cgroup_end_page_stat().
	 */
	memcg->move_lock_task = current;
	memcg->move_lock_flags = flags;

	return memcg;
}

/**
 * mem_cgroup_end_page_stat - finish a page state statistics transaction
 * @memcg: the memcg that was accounted against
 */
void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
	if (memcg && memcg->move_lock_task == current) {
		unsigned long flags = memcg->move_lock_flags;

		memcg->move_lock_task = NULL;
		memcg->move_lock_flags = 0;

		spin_unlock_irqrestore(&memcg->move_lock, flags);
	}

	rcu_read_unlock();
}

/**
 * mem_cgroup_update_page_stat - update page state statistics
 * @memcg: memcg to account against
 * @idx: page state item to account
 * @val: number of pages (positive or negative)
 *
 * See mem_cgroup_begin_page_stat() for locking requirements.
 */
void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx, int val)
{
	VM_BUG_ON(!rcu_read_lock_held());

	if (memcg)
		this_cpu_add(memcg->stat->count[idx], val);
}

/*
 * Size of the first charge trial.  "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big iron.
 */
#define CHARGE_BATCH	32U
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;
	struct work_struct work;
	unsigned long flags;
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);

/**
 * consume_stock: Try to consume stocked charge on this cpu.
 * @memcg: memcg to consume from.
 * @nr_pages: how many pages to charge.
 *
 * The charges will only happen if @memcg matches the current cpu's memcg
 * stock, and at least @nr_pages are available in that stock.  Failure to
 * service an allocation will refill the stock.
 *
 * returns true if successful, false otherwise.
 */
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock;
	bool ret = false;

	if (nr_pages > CHARGE_BATCH)
		return ret;

	stock = &get_cpu_var(memcg_stock);
	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
		stock->nr_pages -= nr_pages;
		ret = true;
	}
	put_cpu_var(memcg_stock);
	return ret;
}

/*
 * Returns the stocked charges to the page counters and resets the cached
 * information.
 */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->cached;

	if (stock->nr_pages) {
		page_counter_uncharge(&old->memory, stock->nr_pages);
		if (do_swap_account)
			page_counter_uncharge(&old->memsw, stock->nr_pages);
		css_put_many(&old->css, stock->nr_pages);
		stock->nr_pages = 0;
	}
	stock->cached = NULL;
}

/*
 * This must be called with preemption disabled, or by a thread that
 * is pinned to the local cpu.
 */
static void drain_local_stock(struct work_struct *dummy)
{
	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
	drain_stock(stock);
	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}

/*
 * Cache charges (nr_pages) in the local per-cpu area.
 * They will be consumed by consume_stock() later.
 */
static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

	if (stock->cached != memcg) { /* reset if necessary */
		drain_stock(stock);
		stock->cached = memcg;
	}
	stock->nr_pages += nr_pages;
	put_cpu_var(memcg_stock);
}

/*
 * Drains all per-CPU charge caches for the given root_memcg, i.e. for the
 * whole subtree of the hierarchy under it.
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
	int cpu, curcpu;

	/* If someone's already draining, avoid running more workers. */
	if (!mutex_trylock(&percpu_charge_mutex))
		return;
	/* Notify other cpus that system-wide "drain" is running */
	get_online_cpus();
	curcpu = get_cpu();
	for_each_online_cpu(cpu) {
		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
		struct mem_cgroup *memcg;

		memcg = stock->cached;
		if (!memcg || !stock->nr_pages)
			continue;
		if (!mem_cgroup_is_descendant(memcg, root_memcg))
			continue;
		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
	}
	put_cpu();
	put_online_cpus();
	mutex_unlock(&percpu_charge_mutex);
}

/*
 * This function drains the percpu counter values from a DEAD cpu and
 * folds them into nocpu_base.  Note that this function can be preempted.
 */
static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
{
	int i;

	spin_lock(&memcg->pcp_counter_lock);
	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long x = per_cpu(memcg->stat->count[i], cpu);

		per_cpu(memcg->stat->count[i], cpu) = 0;
		memcg->nocpu_base.count[i] += x;
	}
	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long x = per_cpu(memcg->stat->events[i], cpu);

		per_cpu(memcg->stat->events[i], cpu) = 0;
		memcg->nocpu_base.events[i] += x;
	}
	spin_unlock(&memcg->pcp_counter_lock);
}

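/*
 * CPU hotplug callback: on CPU_DEAD (or CPU_DEAD_FROZEN), fold every
 * memcg's percpu statistics from the dead cpu into nocpu_base and drain
 * that cpu's charge stock.
 */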
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;
	struct mem_cgroup *iter;

	if (action == CPU_ONLINE)
		return NOTIFY_OK;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
	return NOTIFY_OK;
}

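/*
 * Charge @nr_pages to @memcg (and to memsw when swap accounting is enabled).
 * Tries the per-cpu stock first, then the page counters, falling back to
 * direct reclaim, draining the stocks and finally the memcg OOM handler.
 * Returns 0 on success, -ENOMEM on failure, or -EINTR when the charge is
 * bypassed (dying tasks, __GFP_NOFAIL).
 */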
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
		      unsigned int nr_pages)
{
	unsigned int batch = max(CHARGE_BATCH, nr_pages);
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
	struct mem_cgroup *mem_over_limit;
	struct page_counter *counter;
	unsigned long nr_reclaimed;
	bool may_swap = true;
	bool drained = false;
	int ret = 0;

	if (mem_cgroup_is_root(memcg))
		goto done;
retry:
	if (consume_stock(memcg, nr_pages))
		goto done;

	if (!do_swap_account ||
	    !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
		if (!page_counter_try_charge(&memcg->memory, batch, &counter))
			goto done_restock;
		if (do_swap_account)
			page_counter_uncharge(&memcg->memsw, batch);
		mem_over_limit = mem_cgroup_from_counter(counter, memory);
	} else {
		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
		may_swap = false;
	}

	if (batch > nr_pages) {
		batch = nr_pages;
		goto retry;
	}

	/*
	 * Unlike in global OOM situations, memcg is not in a physical
	 * memory shortage.  Allow dying and OOM-killed tasks to
	 * bypass the last charges so that they can exit quickly and
	 * free their memory.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
		     fatal_signal_pending(current) ||
		     current->flags & PF_EXITING))
		goto bypass;

	if (unlikely(task_in_memcg_oom(current)))
		goto nomem;

	if (!(gfp_mask & __GFP_WAIT))
		goto nomem;

	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);

	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
						    gfp_mask, may_swap);

	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		goto retry;

	if (!drained) {
		drain_all_stock(mem_over_limit);
		drained = true;
		goto retry;
	}

	if (gfp_mask & __GFP_NORETRY)
		goto nomem;
	/*
	 * Even though the limit is exceeded at this point, reclaim
	 * may have been able to free some pages.  Retry the charge
	 * before killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather
	 * unlikely to succeed so close to the limit, and we fall back
	 * to regular pages anyway in case of failure.
	 */
	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
		goto retry;
	/*
	 * At task move, charge accounts can be doubly counted. So, it's
	 * better to wait until the end of task_move if something is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		goto retry;

	if (nr_retries--)
		goto retry;

	if (gfp_mask & __GFP_NOFAIL)
		goto bypass;

	if (fatal_signal_pending(current))
		goto bypass;

	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);

	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
	if (!(gfp_mask & __GFP_NOFAIL))
		return -ENOMEM;
bypass:
	return -EINTR;

done_restock:
	css_get_many(&memcg->css, batch);
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
	if (!(gfp_mask & __GFP_WAIT))
		goto done;
	/*
	 * If the hierarchy is above the normal consumption range,
	 * make the charging task trim their excess contribution.
	 */
	do {
		if (page_counter_read(&memcg->memory) <= memcg->high)
			continue;
		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
	} while ((memcg = parent_mem_cgroup(memcg)));
done:
	return ret;
}

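/*
 * Undo a successful try_charge(): give the pages back to the counters and
 * drop the css references that were taken for them (no-op for the root
 * memcg).
 */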
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
	if (mem_cgroup_is_root(memcg))
		return;

	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_swap_account)
		page_counter_uncharge(&memcg->memsw, nr_pages);

	css_put_many(&memcg->css, nr_pages);
}

/*
 * try_get_mem_cgroup_from_page - look up page's memcg association
 * @page: the page
 *
 * Look up, get a css reference, and return the memcg that owns @page.
 *
 * The page must be locked to prevent racing with swap-in and page
 * cache charges.  If coming from an unlocked page table, the caller
 * must ensure the page is on the LRU or this can race with charging.
 */
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned short id;
	swp_entry_t ent;

	VM_BUG_ON_PAGE(!PageLocked(page), page);

	memcg = page->mem_cgroup;
	if (memcg) {
		if (!css_tryget_online(&memcg->css))
			memcg = NULL;
	} else if (PageSwapCache(page)) {
		ent.val = page_private(page);
		id = lookup_swap_cgroup_id(ent);
		rcu_read_lock();
		memcg = mem_cgroup_from_id(id);
		if (memcg && !css_tryget_online(&memcg->css))
			memcg = NULL;
		rcu_read_unlock();
	}
	return memcg;
}

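/*
 * Take the zone lru_lock and, if @page is on an LRU list, isolate it from
 * that list; *isolated records whether unlock_page_lru() must put it back.
 */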
static void lock_page_lru(struct page *page, int *isolated)
{
	struct zone *zone = page_zone(page);

	spin_lock_irq(&zone->lru_lock);
	if (PageLRU(page)) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, zone);
		ClearPageLRU(page);
		del_page_from_lru_list(page, lruvec, page_lru(page));
		*isolated = 1;
	} else
		*isolated = 0;
}

static void unlock_page_lru(struct page *page, int isolated)
{
	struct zone *zone = page_zone(page);

	if (isolated) {
		struct lruvec *lruvec;

		lruvec = mem_cgroup_page_lruvec(page, zone);
		VM_BUG_ON_PAGE(PageLRU(page), page);
		SetPageLRU(page);
		add_page_to_lru_list(page, lruvec, page_lru(page));
	}
	spin_unlock_irq(&zone->lru_lock);
}

static void commit_charge(struct page *page, struct mem_cgroup *memcg,
			  bool lrucare)
{
	int isolated;

	VM_BUG_ON_PAGE(page->mem_cgroup, page);

	/*
	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
	 * may already be on some other mem_cgroup's LRU.  Take care of it.
	 */
	if (lrucare)
		lock_page_lru(page, &isolated);

	/*
	 * Nobody should be changing or seriously looking at
	 * page->mem_cgroup at this point:
	 *
	 * - the page is uncharged
	 *
	 * - the page is off-LRU
	 *
	 * - an anonymous fault has exclusive page access, except for
	 *   a locked page table
	 *
	 * - a page cache insertion, a swapin fault, or a migration
	 *   have the page locked
	 */
	page->mem_cgroup = memcg;

	if (lrucare)
		unlock_page_lru(page, isolated);
}

#ifdef CONFIG_MEMCG_KMEM
int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
		      unsigned long nr_pages)
{
	struct page_counter *counter;
	int ret = 0;

	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
	if (ret < 0)
		return ret;

	ret = try_charge(memcg, gfp, nr_pages);
	if (ret == -EINTR)  {
		/*
		 * try_charge() chose to bypass to root due to OOM kill or
		 * fatal signal.  Since our only options are to either fail
		 * the allocation or charge it to this cgroup, do it as a
		 * temporary condition. But we can't fail. From a kmem/slab
		 * perspective, the cache has already been selected, by
		 * mem_cgroup_kmem_get_cache(), so it is too late to change
		 * our minds.
		 *
		 * This condition will only trigger if the task entered
		 * memcg_charge_kmem in a sane state, but was OOM-killed
		 * during try_charge() above. Tasks that were already dying
		 * when the allocation triggers should have been already
		 * directed to the root cgroup in memcontrol.h
		 */
		page_counter_charge(&memcg->memory, nr_pages);
		if (do_swap_account)
			page_counter_charge(&memcg->memsw, nr_pages);
		css_get_many(&memcg->css, nr_pages);
		ret = 0;
	} else if (ret)
		page_counter_uncharge(&memcg->kmem, nr_pages);

	return ret;
}

void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
{
	page_counter_uncharge(&memcg->memory, nr_pages);
	if (do_swap_account)
		page_counter_uncharge(&memcg->memsw, nr_pages);

	page_counter_uncharge(&memcg->kmem, nr_pages);

	css_put_many(&memcg->css, nr_pages);
}

/*
 * helper for accessing a memcg's index. It will be used as an index in the
 * child cache array in kmem_cache, and also to derive its name. This function
 * will return -1 when this is not a kmem-limited memcg.
 */
int memcg_cache_id(struct mem_cgroup *memcg)
{
	return memcg ? memcg->kmemcg_id : -1;
}

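/*
 * Allocate a kmemcg id from the IDA; if the id does not fit into the current
 * memcg_caches arrays, grow them (and the per-memcg list_lrus) first.
 */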
static int memcg_alloc_cache_id(void)
{
	int id, size;
	int err;

	id = ida_simple_get(&memcg_cache_ida,
			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	if (id < 0)
		return id;

	if (id < memcg_nr_cache_ids)
		return id;

	/*
	 * There's no space for the new id in memcg_caches arrays,
	 * so we have to grow them.
	 */
	down_write(&memcg_cache_ids_sem);

	size = 2 * (id + 1);
	if (size < MEMCG_CACHES_MIN_SIZE)
		size = MEMCG_CACHES_MIN_SIZE;
	else if (size > MEMCG_CACHES_MAX_SIZE)
		size = MEMCG_CACHES_MAX_SIZE;

	err = memcg_update_all_caches(size);
	if (!err)
		err = memcg_update_all_list_lrus(size);
	if (!err)
		memcg_nr_cache_ids = size;

	up_write(&memcg_cache_ids_sem);

	if (err) {
		ida_simple_remove(&memcg_cache_ida, id);
		return err;
	}
	return id;
}

static void memcg_free_cache_id(int id)
{
	ida_simple_remove(&memcg_cache_ida, id);
}

struct memcg_kmem_cache_create_work {
	struct mem_cgroup *memcg;
	struct kmem_cache *cachep;
	struct work_struct work;
};

static void memcg_kmem_cache_create_func(struct work_struct *w)
{
	struct memcg_kmem_cache_create_work *cw =
		container_of(w, struct memcg_kmem_cache_create_work, work);
	struct mem_cgroup *memcg = cw->memcg;
	struct kmem_cache *cachep = cw->cachep;

	memcg_create_kmem_cache(memcg, cachep);

	css_put(&memcg->css);
	kfree(cw);
}

/*
 * Enqueue the creation of a per-memcg kmem_cache.
 */
static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					       struct kmem_cache *cachep)
{
	struct memcg_kmem_cache_create_work *cw;

	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
	if (!cw)
		return;

	css_get(&memcg->css);

	cw->memcg = memcg;
	cw->cachep = cachep;
	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);

	schedule_work(&cw->work);
}

static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					     struct kmem_cache *cachep)
{
	/*
	 * We need to stop accounting when we kmalloc, because if the
	 * corresponding kmalloc cache is not yet created, the first allocation
	 * in __memcg_schedule_kmem_cache_create will recurse.
	 *
	 * However, it is better to enclose the whole function.  Depending on
	 * the debugging options enabled, INIT_WORK(), for instance, can
	 * trigger an allocation, which will also make us recurse.  Because at
	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
	 * the safest choice is to wrap the whole function like this.
	 */
	current->memcg_kmem_skip_account = 1;
	__memcg_schedule_kmem_cache_create(memcg, cachep);
	current->memcg_kmem_skip_account = 0;
}

/*
 * Return the kmem_cache we're supposed to use for a slab allocation.
 * We try to use the current memcg's version of the cache.
 *
 * If the cache does not exist yet, i.e. we are its first user, we either
 * create it immediately, if possible, or create it asynchronously in a
 * workqueue.
 * In the latter case, we will let the current allocation go through with
 * the original cache.
 *
 * Can't be called in interrupt context or from kernel threads.
 * This function needs to be called with rcu_read_lock() held.
 */
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
	struct mem_cgroup *memcg;
	struct kmem_cache *memcg_cachep;
	int kmemcg_id;

	VM_BUG_ON(!is_root_cache(cachep));

	if (current->memcg_kmem_skip_account)
		return cachep;

	memcg = get_mem_cgroup_from_mm(current->mm);
	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
	if (kmemcg_id < 0)
		goto out;

	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
	if (likely(memcg_cachep))
		return memcg_cachep;

	/*
	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
	 * This would guarantee that the allocation being performed
	 * already belongs in the new cache.
	 *
	 * However, there are some clashes that can arise from locking.
	 * For instance, because we acquire the slab_mutex while doing
	 * memcg_create_kmem_cache, no further allocation could happen
	 * with the slab_mutex held. So it's better to defer everything.
	 */
	memcg_schedule_kmem_cache_create(memcg, cachep);
out:
	css_put(&memcg->css);
	return cachep;
}

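/*
 * Release the memcg css reference that __memcg_kmem_get_cache() left held
 * when it returned a per-memcg cache.
 */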
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
	if (!is_root_cache(cachep))
		css_put(&cachep->memcg_params.memcg->css);
}

/*
 * We need to verify whether the allocation against current->mm->owner's memcg
 * is possible for the given order. But the page is not allocated yet, so we'll
 * need a further commit step to do the final arrangements.
 *
 * It is possible for the task to switch cgroups in the meantime, so at
 * commit time, we can't rely on task conversion any longer.  We'll then use
 * the handle argument to return to the caller which cgroup we should commit
 * against. We could also return the memcg directly and avoid the pointer
 * passing, but a boolean return value gives better semantics considering
 * the compiled-out case as well.
 *
 * Returning true means the allocation is possible.
 */
bool
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
{
	struct mem_cgroup *memcg;
	int ret;

	*_memcg = NULL;

	memcg = get_mem_cgroup_from_mm(current->mm);

	if (!memcg_kmem_is_active(memcg)) {
		css_put(&memcg->css);
		return true;
	}

	ret = memcg_charge_kmem(memcg, gfp, 1 << order);
	if (!ret)
		*_memcg = memcg;

	css_put(&memcg->css);
	return (ret == 0);
}

void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      int order)
{
	VM_BUG_ON(mem_cgroup_is_root(memcg));

	/* The page allocation failed. Revert */
	if (!page) {
		memcg_uncharge_kmem(memcg, 1 << order);
		return;
	}
	page->mem_cgroup = memcg;
}

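/*
 * Uncharge a kmem page charged via __memcg_kmem_newpage_charge() and clear
 * its page->mem_cgroup binding.
 */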
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (!memcg)
		return;

	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);

	memcg_uncharge_kmem(memcg, 1 << order);
	page->mem_cgroup = NULL;
}

struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
{
	struct mem_cgroup *memcg = NULL;
	struct kmem_cache *cachep;
	struct page *page;

	page = virt_to_head_page(ptr);
	if (PageSlab(page)) {
		cachep = page->slab_cache;
		if (!is_root_cache(cachep))
			memcg = cachep->memcg_params.memcg;
	} else
		/* page allocated by alloc_kmem_pages */
		memcg = page->mem_cgroup;

	return memcg;
}
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * Because tail pages are not marked as "used", set them up here. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock.
 * charge/uncharge will never happen and move_account() is done under
 * compound_lock(), so we don't have to take care of races.
 */
void mem_cgroup_split_huge_fixup(struct page *head)
{
	int i;

	if (mem_cgroup_disabled())
		return;

	for (i = 1; i < HPAGE_PMD_NR; i++)
		head[i].mem_cgroup = head->mem_cgroup;

	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
		       HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

/**
 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
 * @entry: swap entry to be moved
 * @from:  mem_cgroup which the entry is moved from
 * @to:  mem_cgroup which the entry is moved to
 *
 * It succeeds only when the swap_cgroup's record for this entry is the same
 * as the mem_cgroup's id of @from.
 *
 * Returns 0 on success, -EINVAL on failure.
 *
 * The caller must have charged to @to, IOW, called page_counter_charge() on
 * both memory and memsw, and called css_get().
 */
static int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	unsigned short old_id, new_id;

	old_id = mem_cgroup_id(from);
	new_id = mem_cgroup_id(to);

	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mem_cgroup_swap_statistics(from, false);
		mem_cgroup_swap_statistics(to, true);
		return 0;
	}
	return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
#endif

static DEFINE_MUTEX(memcg_limit_mutex);

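/*
 * Set a new limit on memcg->memory.  If the new limit is below the current
 * usage, reclaim from the cgroup and retry a bounded number of times.
 */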
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	unsigned long curusage;
	unsigned long oldusage;
	bool enlarge = false;
	int retry_count;
	int ret;

	/*
	 * To keep hierarchical reclaim simple, how long we should retry
	 * depends on the caller. We set our retry count to be a function
	 * of the number of children we should visit in this loop.
	 */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
		      mem_cgroup_count_children(memcg);

	oldusage = page_counter_read(&memcg->memory);

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		if (limit > memcg->memsw.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		if (limit > memcg->memory.limit)
			enlarge = true;
		ret = page_counter_limit(&memcg->memory, limit);
		mutex_unlock(&memcg_limit_mutex);

		if (!ret)
			break;

		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);

		curusage = page_counter_read(&memcg->memory);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	} while (retry_count);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
					 unsigned long limit)
{
	unsigned long curusage;
	unsigned long oldusage;
	bool enlarge = false;
	int retry_count;
	int ret;

	/* see mem_cgroup_resize_limit() */
	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
		      mem_cgroup_count_children(memcg);

	oldusage = page_counter_read(&memcg->memsw);

	do {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		mutex_lock(&memcg_limit_mutex);
		if (limit < memcg->memory.limit) {
			mutex_unlock(&memcg_limit_mutex);
			ret = -EINVAL;
			break;
		}
		if (limit > memcg->memsw.limit)
			enlarge = true;
		ret = page_counter_limit(&memcg->memsw, limit);
		mutex_unlock(&memcg_limit_mutex);

		if (!ret)
			break;

		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);

		curusage = page_counter_read(&memcg->memsw);
		/* Usage is reduced ? */
		if (curusage >= oldusage)
			retry_count--;
		else
			oldusage = curusage;
	} while (retry_count);

	if (!ret && enlarge)
		memcg_oom_recover(memcg);

	return ret;
}

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
					    gfp_t gfp_mask,
					    unsigned long *total_scanned)
{
	unsigned long nr_reclaimed = 0;
	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
	unsigned long reclaimed;
	int loop = 0;
	struct mem_cgroup_tree_per_zone *mctz;
	unsigned long excess;
	unsigned long nr_scanned;

	if (order > 0)
		return 0;

	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
	do {
		if (next_mz)
			mz = next_mz;
		else
			mz = mem_cgroup_largest_soft_limit_node(mctz);
		if (!mz)
			break;

		nr_scanned = 0;
		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
						    gfp_mask, &nr_scanned);
		nr_reclaimed += reclaimed;
		*total_scanned += nr_scanned;
		spin_lock_irq(&mctz->lock);
		__mem_cgroup_remove_exceeded(mz, mctz);

		/*
		 * If we failed to reclaim anything from this memory cgroup
		 * it is time to move on to the next cgroup
		 */
		next_mz = NULL;
		if (!reclaimed)
			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

		excess = soft_limit_excess(mz->memcg);
		/*
		 * One school of thought says that we should not add
		 * back the node to the tree if reclaim returns 0.
		 * But our reclaim could return 0, simply because due
		 * to priority we are exposing a smaller subset of
		 * memory to reclaim from. Consider this as a longer
		 * term TODO.
		 */
		/* If excess == 0, no tree ops */
		__mem_cgroup_insert_exceeded(mz, mctz, excess);
		spin_unlock_irq(&mctz->lock);
		css_put(&mz->memcg->css);
		loop++;
		/*
		 * Could not reclaim anything and there are no more
		 * mem cgroups to try or we seem to be looping without
		 * reclaiming anything.
		 */
		if (!nr_reclaimed &&
			(next_mz == NULL ||
			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
			break;
	} while (!nr_reclaimed);
	if (next_mz)
		css_put(&next_mz->memcg->css);
	return nr_reclaimed;
}

/*
 * Test whether @memcg has children, dead or alive.  Note that this
 * function doesn't care whether @memcg has use_hierarchy enabled and
 * returns %true if there are child csses according to the cgroup
 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
 */
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
	bool ret;

	/*
	 * The lock does not prevent addition or deletion of children, but
	 * it prevents a new child from being initialized based on this
	 * parent in css_online(), so it's enough to decide whether
	 * hierarchically inherited attributes can still be changed or not.
	 */
	lockdep_assert_held(&memcg_create_mutex);

	rcu_read_lock();
	ret = css_next_child(NULL, &memcg->css);
	rcu_read_unlock();
	return ret;
}

/*
 * Reclaims as many pages from the given memcg as possible and moves
 * the rest to the parent.
 *
 * Caller is responsible for holding css reference for memcg.
 */
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
	lru_add_drain_all();
	/* try to free all pages in this cgroup */
	while (nr_retries && page_counter_read(&memcg->memory)) {
		int progress;

		if (signal_pending(current))
			return -EINTR;

		progress = try_to_free_mem_cgroup_pages(memcg, 1,
							GFP_KERNEL, true);
		if (!progress) {
			nr_retries--;
			/* maybe some writeback is necessary */
			congestion_wait(BLK_RW_ASYNC, HZ/10);
		}

	}

	return 0;
}

static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

	if (mem_cgroup_is_root(memcg))
		return -EINVAL;
	return mem_cgroup_force_empty(memcg) ?: nbytes;
}

static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
				     struct cftype *cft)
{
	return mem_cgroup_from_css(css)->use_hierarchy;
}

static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, u64 val)
{
	int retval = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

	mutex_lock(&memcg_create_mutex);

	if (memcg->use_hierarchy == val)
		goto out;

	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_memcg is NULL; we allow the value to
	 * be set if there are no children.
	 */
	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
				(val == 1 || val == 0)) {
		if (!memcg_has_children(memcg))
			memcg->use_hierarchy = val;
		else
			retval = -EBUSY;
	} else
		retval = -EINVAL;

out:
	mutex_unlock(&memcg_create_mutex);

	return retval;
}

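/* Sum a statistics counter over @memcg and all of its descendants. */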
static unsigned long tree_stat(struct mem_cgroup *memcg,
			       enum mem_cgroup_stat_index idx)
{
	struct mem_cgroup *iter;
	long val = 0;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_mem_cgroup_tree(iter, memcg)
		val += mem_cgroup_read_stat(iter, idx);

	if (val < 0) /* race ? */
		val = 0;
	return val;
}

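/*
 * Return the memory (and optionally swap) usage of @memcg in bytes.  For the
 * root cgroup the value is derived from the hierarchical stat counters rather
 * than the page counters.
 */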
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	u64 val;

	if (mem_cgroup_is_root(memcg)) {
		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
		if (swap)
			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
	} else {
		if (!swap)
			val = page_counter_read(&memcg->memory);
		else
			val = page_counter_read(&memcg->memsw);
	}
	return val << PAGE_SHIFT;
}

enum {
	RES_USAGE,
	RES_LIMIT,
	RES_MAX_USAGE,
	RES_FAILCNT,
	RES_SOFT_LIMIT,
};

static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct page_counter *counter;

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(cft->private)) {
	case RES_USAGE:
		if (counter == &memcg->memory)
			return mem_cgroup_usage(memcg, false);
		if (counter == &memcg->memsw)
			return mem_cgroup_usage(memcg, true);
		return (u64)page_counter_read(counter) * PAGE_SIZE;
	case RES_LIMIT:
		return (u64)counter->limit * PAGE_SIZE;
	case RES_MAX_USAGE:
		return (u64)counter->watermark * PAGE_SIZE;
	case RES_FAILCNT:
		return counter->failcnt;
	case RES_SOFT_LIMIT:
		return (u64)memcg->soft_limit * PAGE_SIZE;
	default:
		BUG();
	}
}

#ifdef CONFIG_MEMCG_KMEM
static int memcg_activate_kmem(struct mem_cgroup *memcg,
			       unsigned long nr_pages)
{
	int err = 0;
	int memcg_id;

	BUG_ON(memcg->kmemcg_id >= 0);
	BUG_ON(memcg->kmem_acct_activated);
	BUG_ON(memcg->kmem_acct_active);

	/*
	 * For simplicity, we won't allow this to be disabled.  It also can't
	 * be changed if the cgroup already has children, or if tasks have
	 * already joined.
	 *
	 * If tasks join before we set the limit, a person looking at
	 * kmem.usage_in_bytes will have no way to determine when it took
	 * place, which makes the value quite meaningless.
	 *
	 * Once the cgroup first becomes limited, changes in the value of the
	 * limit are of course permitted.
	 */
	mutex_lock(&memcg_create_mutex);
	if (cgroup_has_tasks(memcg->css.cgroup) ||
	    (memcg->use_hierarchy && memcg_has_children(memcg)))
		err = -EBUSY;
	mutex_unlock(&memcg_create_mutex);
	if (err)
		goto out;

	memcg_id = memcg_alloc_cache_id();
	if (memcg_id < 0) {
		err = memcg_id;
		goto out;
	}

	/*
	 * We couldn't have accounted to this cgroup, because it hasn't been
	 * activated yet, so this should succeed.
	 */
	err = page_counter_limit(&memcg->kmem, nr_pages);
	VM_BUG_ON(err);

	static_key_slow_inc(&memcg_kmem_enabled_key);
	/*
	 * A memory cgroup is considered kmem-active as soon as it gets
	 * kmemcg_id. Setting the id after enabling static branching will
	 * guarantee no one starts accounting before all call sites are
	 * patched.
	 */
	memcg->kmemcg_id = memcg_id;
	memcg->kmem_acct_activated = true;
	memcg->kmem_acct_active = true;
out:
	return err;
}

static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	int ret;

	mutex_lock(&memcg_limit_mutex);
	if (!memcg_kmem_is_active(memcg))
		ret = memcg_activate_kmem(memcg, limit);
	else
		ret = page_counter_limit(&memcg->kmem, limit);
	mutex_unlock(&memcg_limit_mutex);
	return ret;
}

static int memcg_propagate_kmem(struct mem_cgroup *memcg)
{
	int ret = 0;
	struct mem_cgroup *parent = parent_mem_cgroup(memcg);

	if (!parent)
		return 0;

	mutex_lock(&memcg_limit_mutex);
	/*
	 * If the parent cgroup is not kmem-active now, it cannot be activated
	 * after this point, because it has at least one child already.
	 */
	if (memcg_kmem_is_active(parent))
		ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
	mutex_unlock(&memcg_limit_mutex);
	return ret;
}
#else
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
				   unsigned long limit)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMCG_KMEM */

/*
 * The user of this function is...
 * RES_LIMIT.
 */
static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long nr_pages;
	int ret;

	buf = strstrip(buf);
	ret = page_counter_memparse(buf, "-1", &nr_pages);
	if (ret)
		return ret;

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		switch (MEMFILE_TYPE(of_cft(of)->private)) {
		case _MEM:
			ret = mem_cgroup_resize_limit(memcg, nr_pages);
			break;
		case _MEMSWAP:
			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
			break;
		case _KMEM:
			ret = memcg_update_kmem_limit(memcg, nr_pages);
			break;
		}
		break;
	case RES_SOFT_LIMIT:
		memcg->soft_limit = nr_pages;
		ret = 0;
		break;
	}
	return ret ?: nbytes;
}

static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
				size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct page_counter *counter;

	switch (MEMFILE_TYPE(of_cft(of)->private)) {
	case _MEM:
		counter = &memcg->memory;
		break;
	case _MEMSWAP:
		counter = &memcg->memsw;
		break;
	case _KMEM:
		counter = &memcg->kmem;
		break;
	default:
		BUG();
	}

	switch (MEMFILE_ATTR(of_cft(of)->private)) {
	case RES_MAX_USAGE:
		page_counter_reset_watermark(counter);
		break;
	case RES_FAILCNT:
		counter->failcnt = 0;
		break;
	default:
		BUG();
	}

	return nbytes;
}

static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
					struct cftype *cft)
{
	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
}

#ifdef CONFIG_MMU
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val & ~MOVE_MASK)
		return -EINVAL;

	/*
	 * No kind of locking is needed in here, because ->can_attach() will
	 * check this value once at the beginning of the process, and then carry
	 * on with stale data. This means that changes to this value will only
	 * affect task migrations starting after the change.
	 */
	memcg->move_charge_at_immigrate = val;
	return 0;
}
#else
static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
					struct cftype *cft, u64 val)
{
	return -ENOSYS;
}
#endif

#ifdef CONFIG_NUMA
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
	struct numa_stat {
		const char *name;
		unsigned int lru_mask;
	};

	static const struct numa_stat stats[] = {
		{ "total", LRU_ALL },
		{ "file", LRU_ALL_FILE },
		{ "anon", LRU_ALL_ANON },
		{ "unevictable", BIT(LRU_UNEVICTABLE) },
	};
	const struct numa_stat *stat;
	int nid;
	unsigned long nr;
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
		seq_printf(m, "%s=%lu", stat->name, nr);
		for_each_node_state(nid, N_MEMORY) {
			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
							  stat->lru_mask);
			seq_printf(m, " N%d=%lu", nid, nr);
		}
		seq_putc(m, '\n');
	}

	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
		struct mem_cgroup *iter;

		nr = 0;
		for_each_mem_cgroup_tree(iter, memcg)
			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
		for_each_node_state(nid, N_MEMORY) {
			nr = 0;
			for_each_mem_cgroup_tree(iter, memcg)
				nr += mem_cgroup_node_nr_lru_pages(
					iter, nid, stat->lru_mask);
			seq_printf(m, " N%d=%lu", nid, nr);
		}
		seq_putc(m, '\n');
	}

	return 0;
}
#endif /* CONFIG_NUMA */

static int memcg_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;

	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
		     MEM_CGROUP_STAT_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
		     MEM_CGROUP_EVENTS_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

	/* Hierarchical information */
	memory = memsw = PAGE_COUNTER_MAX;
	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
		memory = min(memory, mi->memory.limit);
		memsw = min(memsw, mi->memsw.limit);
	}
	seq_printf(m, "hierarchical_memory_limit %llu\n",
		   (u64)memory * PAGE_SIZE);
	if (do_swap_account)
		seq_printf(m, "hierarchical_memsw_limit %llu\n",
			   (u64)memsw * PAGE_SIZE);

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long long val = 0;

		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_events(mi, i);
		seq_printf(m, "total_%s %llu\n",
			   mem_cgroup_events_names[i], val);
	}

	for (i = 0; i < NR_LRU_LISTS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
	}

#ifdef CONFIG_DEBUG_VM
	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		struct zone_reclaim_stat *rstat;
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
				rstat = &mz->lruvec.reclaim_stat;

				recent_rotated[0] += rstat->recent_rotated[0];
				recent_rotated[1] += rstat->recent_rotated[1];
				recent_scanned[0] += rstat->recent_scanned[0];
				recent_scanned[1] += rstat->recent_scanned[1];
			}
		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
	}
#endif

	return 0;
}

static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
				      struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	return mem_cgroup_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	if (val > 100)
		return -EINVAL;

	if (css->parent)
		memcg->swappiness = val;
	else
		vm_swappiness = val;

	return 0;
}

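/*
 * Walk the threshold array of @memcg and signal every eventfd whose threshold
 * has been crossed since the last call, then update current_threshold.
 */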
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	unsigned long usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * usage. If that is not the case, a threshold was crossed after the
	 * last call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of the thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of the thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_swap_account)
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}

static int compare_thresholds(const void *a, const void *b)
{
	const struct mem_cgroup_threshold *_a = a;
	const struct mem_cgroup_threshold *_b = b;

	if (_a->threshold > _b->threshold)
		return 1;

	if (_a->threshold < _b->threshold)
		return -1;

	return 0;
}

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);

	spin_unlock(&memcg_oom_lock);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}

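/*
 * Register @eventfd to be signalled when the memory (or memsw) usage of
 * @memcg crosses the threshold given in @args.  Thresholds are kept in a
 * sorted, RCU-protected array.
 */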
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long threshold;
	unsigned long usage;
	int i, size, ret;

	ret = page_counter_memparse(args, "-1", &threshold);
	if (ret)
		return ret;
	threshold <<= PAGE_SHIFT;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
			GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary) {
		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
				sizeof(struct mem_cgroup_threshold));
	}

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}

static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}

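/*
 * Remove all thresholds registered for @eventfd, reusing the spare buffer
 * for the shrunken threshold array.
 */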
static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, enum res_type type)
{
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	unsigned long usage;
	int i, j, size;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM) {
		thresholds = &memcg->thresholds;
		usage = mem_cgroup_usage(memcg, false);
	} else if (type == _MEMSWAP) {
		thresholds = &memcg->memsw_thresholds;
		usage = mem_cgroup_usage(memcg, true);
	} else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate the new number of thresholds */
	size = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
	}

	new = thresholds->spare;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;
	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}

static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
}

static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
}

static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup_eventfd_list *event;

	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (atomic_read(&memcg->under_oom))
		eventfd_signal(eventfd, 1);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
	struct eventfd_ctx *eventfd)
{
	struct mem_cgroup_eventfd_list *ev, *tmp;

	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}

	spin_unlock(&memcg_oom_lock);
}

static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));

	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
	seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
	return 0;
}

static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (!css->parent || !((val == 0) || (val == 1)))
		return -EINVAL;

	memcg->oom_kill_disable = val;
	if (!val)
		memcg_oom_recover(memcg);

	return 0;
}

#ifdef CONFIG_MEMCG_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	int ret;

	ret = memcg_propagate_kmem(memcg);
	if (ret)
		return ret;

	return mem_cgroup_sockets_init(memcg, ss);
}

static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *parent, *child;
	int kmemcg_id;

	if (!memcg->kmem_acct_active)
		return;

	/*
	 * Clear the 'active' flag before clearing the memcg_caches arrays' entries.
	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
	 * guarantees no cache will be created for this cgroup after we are
	 * done (see memcg_create_kmem_cache()).
	 */
	memcg->kmem_acct_active = false;

	memcg_deactivate_kmem_caches(memcg);

	kmemcg_id = memcg->kmemcg_id;
	BUG_ON(kmemcg_id < 0);

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/*
	 * Change kmemcg_id of this cgroup and all its descendants to the
	 * parent's id, and then move all entries from this cgroup's list_lrus
	 * to those of the parent. After we have finished, all list_lrus
	 * corresponding to this cgroup are guaranteed to remain empty. The
	 * ordering is imposed by list_lru_node->lock taken by
	 * memcg_drain_all_list_lrus().
	 */
	css_for_each_descendant_pre(css, &memcg->css) {
		child = mem_cgroup_from_css(css);
		BUG_ON(child->kmemcg_id != kmemcg_id);
		child->kmemcg_id = parent->kmemcg_id;
		if (!memcg->use_hierarchy)
			break;
	}
	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);

	memcg_free_cache_id(kmemcg_id);
}

static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
	if (memcg->kmem_acct_activated) {
		memcg_destroy_kmem_caches(memcg);
		static_key_slow_dec(&memcg_kmem_enabled_key);
		WARN_ON(page_counter_read(&memcg->kmem));
	}
	mem_cgroup_sockets_destroy(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return 0;
}

static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
{
}

static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
#endif

/*
 * DO NOT USE IN NEW FILES.
 *
 * "cgroup.event_control" implementation.
 *
 * This is way over-engineered.  It tries to support fully configurable
 * events for each user.  Such level of flexibility is completely
 * unnecessary especially in the light of the planned unified hierarchy.
 *
 * Please deprecate this and replace with something simpler if at all
 * possible.
 */

/*
 * Unregister event and free resources.
 *
 * Gets called from workqueue.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace the event is going away. */
	eventfd_signal(event->eventfd, 1);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}

/*
 * Gets called on POLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 */
static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
			    int sync, void *key)
{
	struct mem_cgroup_event *event =
		container_of(wait, struct mem_cgroup_event, wait);
	struct mem_cgroup *memcg = event->memcg;
	unsigned long flags = (unsigned long)key;

	if (flags & POLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will clean up
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&memcg->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but memcg_event_remove()
			 * may sleep, so we have to call it in a workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&memcg->event_list_lock);
	}

	return 0;
}

static void memcg_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct mem_cgroup_event *event =
		container_of(pt, struct mem_cgroup_event, pt);

	event->wqh = wqh;
	add_wait_queue(wqh, &event->wait);
}

/*
 * DO NOT USE IN NEW FILES.
 *
 * Parse input and register new cgroup event handler.
 *
 * Input must be in format '<event_fd> <control_fd> <args>'.
 * Interpretation of args is defined by control file implementation.
 */
static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
					 char *buf, size_t nbytes, loff_t off)
{
	struct cgroup_subsys_state *css = of_css(of);
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event;
	struct cgroup_subsys_state *cfile_css;
	unsigned int efd, cfd;
	struct fd efile;
	struct fd cfile;
	const char *name;
	char *endp;
	int ret;

	buf = strstrip(buf);

	efd = simple_strtoul(buf, &endp, 10);
	if (*endp != ' ')
		return -EINVAL;
	buf = endp + 1;

	cfd = simple_strtoul(buf, &endp, 10);
	if ((*endp != ' ') && (*endp != '\0'))
		return -EINVAL;
	buf = endp + 1;

	event = kzalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	event->memcg = memcg;
	INIT_LIST_HEAD(&event->list);
	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
	INIT_WORK(&event->remove, memcg_event_remove);

	efile = fdget(efd);
	if (!efile.file) {
		ret = -EBADF;
		goto out_kfree;
	}

	event->eventfd = eventfd_ctx_fileget(efile.file);
	if (IS_ERR(event->eventfd)) {
		ret = PTR_ERR(event->eventfd);
		goto out_put_efile;
	}

	cfile = fdget(cfd);
	if (!cfile.file) {
		ret = -EBADF;
		goto out_put_eventfd;
	}

	/* the process needs read permission on the control file */
	/* AV: shouldn't we check that it's been opened for read instead? */
	ret = inode_permission(file_inode(cfile.file), MAY_READ);
	if (ret < 0)
		goto out_put_cfile;

	/*
	 * Determine the event callbacks and set them in @event.  This used
	 * to be done via struct cftype but cgroup core no longer knows
	 * about these events.  The following is crude but the whole thing
	 * is for compatibility anyway.
	 *
	 * DO NOT ADD NEW FILES.
	 */
	name = cfile.file->f_path.dentry->d_name.name;

	if (!strcmp(name, "memory.usage_in_bytes")) {
		event->register_event = mem_cgroup_usage_register_event;
		event->unregister_event = mem_cgroup_usage_unregister_event;
	} else if (!strcmp(name, "memory.oom_control")) {
		event->register_event = mem_cgroup_oom_register_event;
		event->unregister_event = mem_cgroup_oom_unregister_event;
	} else if (!strcmp(name, "memory.pressure_level")) {
		event->register_event = vmpressure_register_event;
		event->unregister_event = vmpressure_unregister_event;
	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
		event->register_event = memsw_cgroup_usage_register_event;
		event->unregister_event = memsw_cgroup_usage_unregister_event;
	} else {
		ret = -EINVAL;
		goto out_put_cfile;
	}

	/*
	 * Verify that @cfile belongs to @css.  Also, remaining events are
	 * automatically removed on cgroup destruction but the removal is
	 * asynchronous, so take an extra ref on @css.
	 */
	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
					       &memory_cgrp_subsys);
	ret = -EINVAL;
	if (IS_ERR(cfile_css))
		goto out_put_cfile;
	if (cfile_css != css) {
		css_put(cfile_css);
		goto out_put_cfile;
	}

	ret = event->register_event(memcg, event->eventfd, buf);
	if (ret)
		goto out_put_css;

	efile.file->f_op->poll(efile.file, &event->pt);

	spin_lock(&memcg->event_list_lock);
	list_add(&event->list, &memcg->event_list);
	spin_unlock(&memcg->event_list_lock);

	fdput(cfile);
	fdput(efile);

	return nbytes;

out_put_css:
	css_put(css);
out_put_cfile:
	fdput(cfile);
out_put_eventfd:
	eventfd_ctx_put(event->eventfd);
out_put_efile:
	fdput(efile);
out_kfree:
	kfree(event);

	return ret;
}

static struct cftype mem_cgroup_legacy_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "stat",
		.seq_show = memcg_stat_show,
	},
	{
		.name = "force_empty",
		.write = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "cgroup.event_control",		/* XXX: for compat */
		.write = memcg_write_event_control,
		.flags = CFTYPE_NO_PREFIX,
		.mode = S_IWUGO,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.seq_show = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
	{
		.name = "pressure_level",
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.seq_show = memcg_numa_stat_show,
	},
#endif
#ifdef CONFIG_MEMCG_KMEM
	{
		.name = "kmem.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.failcnt",
		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "kmem.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
#ifdef CONFIG_SLABINFO
	{
		.name = "kmem.slabinfo",
		.seq_start = slab_start,
		.seq_next = slab_next,
		.seq_stop = slab_stop,
		.seq_show = memcg_slab_show,
	},
#endif
#endif
	{ },	/* terminate */
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it's a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste a lot of memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug callback
	 *       function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		lruvec_init(&mz->lruvec);
		mz->usage_in_excess = 0;
		mz->on_tree = false;
		mz->memcg = memcg;
	}
	memcg->nodeinfo[node] = pn;
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
	kfree(memcg->nodeinfo[node]);
}

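/*
 * Allocate a mem_cgroup structure with room for the per-node info pointers
 * and its per-cpu statistics.
 */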
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *memcg;
	size_t size;

	size = sizeof(struct mem_cgroup);
	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);

	memcg = kzalloc(size, GFP_KERNEL);
	if (!memcg)
		return NULL;

	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
	if (!memcg->stat)
		goto out_free;
	spin_lock_init(&memcg->pcp_counter_lock);
	return memcg;

out_free:
	kfree(memcg);
	return NULL;
}

/*
 * When destroying a mem_cgroup, references from swap_cgroup can remain.
 * (scanning them all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free the mem_cgroup when
 * it goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */

static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_remove_from_trees(memcg);

	for_each_node(node)
		free_mem_cgroup_per_zone_info(memcg, node);

	free_percpu(memcg->stat);
	kfree(memcg);
}

/*
 * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy
 * enabled, or NULL if there is none.
 */
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg->memory.parent)
		return NULL;
	return mem_cgroup_from_counter(memcg->memory.parent, memory);
}
EXPORT_SYMBOL(parent_mem_cgroup);

static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct mem_cgroup *memcg;
	long error = -ENOMEM;
	int node;

	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	for_each_node(node)
		if (alloc_mem_cgroup_per_zone_info(memcg, node))
			goto free_out;

	/* root ? */
	if (parent_css == NULL) {
		root_mem_cgroup = memcg;
		page_counter_init(&memcg->memory, NULL);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
	}

	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);
	vmpressure_init(&memcg->vmpressure);
	INIT_LIST_HEAD(&memcg->event_list);
	spin_lock_init(&memcg->event_list_lock);
#ifdef CONFIG_MEMCG_KMEM
	memcg->kmemcg_id = -1;
#endif

	return &memcg->css;

free_out:
	__mem_cgroup_free(memcg);
	return ERR_PTR(error);
}

static int
mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
	int ret;

	if (css->id > MEM_CGROUP_ID_MAX)
		return -ENOSPC;

	if (!parent)
		return 0;

	mutex_lock(&memcg_create_mutex);

	memcg->use_hierarchy = parent->use_hierarchy;
	memcg->oom_kill_disable = parent->oom_kill_disable;
	memcg->swappiness = mem_cgroup_swappiness(parent);

	if (parent->use_hierarchy) {
		page_counter_init(&memcg->memory, &parent->memory);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, &parent->memsw);
		page_counter_init(&memcg->kmem, &parent->kmem);

		/*
		 * No need to take a reference to the parent because cgroup
		 * core guarantees its existence.
		 */
	} else {
		page_counter_init(&memcg->memory, NULL);
		memcg->high = PAGE_COUNTER_MAX;
		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
		/*
		 * A deeper hierarchy with use_hierarchy == false doesn't make
		 * much sense, so let the cgroup subsystem know about this
		 * unfortunate state in our controller.
		 */
		if (parent != root_mem_cgroup)
			memory_cgrp_subsys.broken_hierarchy = true;
	}
	mutex_unlock(&memcg_create_mutex);

	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
	if (ret)
		return ret;

	/*
	 * Make sure the memcg is initialized: mem_cgroup_iter()
	 * orders reading memcg->initialized against its callers
	 * reading the memcg members.
	 */
	smp_store_release(&memcg->initialized, 1);

	return 0;
}

static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	struct mem_cgroup_event *event, *tmp;

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the cgroup
	 * directory to avoid a race between userspace and kernelspace.
	 */
	spin_lock(&memcg->event_list_lock);
	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock(&memcg->event_list_lock);

	vmpressure_cleanup(&memcg->vmpressure);

	memcg_deactivate_kmem(memcg);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	memcg_destroy_kmem(memcg);
	__mem_cgroup_free(memcg);
}

/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
	memcg->low = 0;
	memcg->high = PAGE_COUNTER_MAX;
	memcg->soft_limit = PAGE_COUNTER_MAX;
}

#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
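/*
 * Charge @count pages to mc.to up front: first in one bulk try_charge()
 * without reclaim, then, if that fails, one page at a time with reclaim.
 */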
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret;

	/* Try a single bulk charge without reclaim first */
	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
	if (!ret) {
		mc.precharge += count;
		return ret;
	}
	if (ret == -EINTR) {
		cancel_charge(root_mem_cgroup, count);
		return ret;
	}

	/* Try charges one by one with reclaim */
	while (count--) {
		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
		/*
		 * In case of failure, any residual charges against
		 * mc.to will be dropped by mem_cgroup_clear_mc()
		 * later on.  However, cancel any charges that are
		 * bypassed to root right away or they'll be lost.
		 */
		if (ret == -EINTR)
			cancel_charge(root_mem_cgroup, 1);
		if (ret)
			return ret;
		mc.precharge++;
		cond_resched();
	}
	return 0;
}

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: the pointer where the target page or swap entry will be stored
 *          (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra refcount taken (callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *
 * Called with pte lock held.
 */
union mc_target {
	struct page	*page;
	swp_entry_t	ent;
};

enum mc_target_type {
	MC_TARGET_NONE = 0,
	MC_TARGET_PAGE,
	MC_TARGET_SWAP,
};

static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		if (!(mc.flags & MOVE_ANON))
			return NULL;
	} else {
		if (!(mc.flags & MOVE_FILE))
			return NULL;
	}
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}

#ifdef CONFIG_SWAP
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
		return NULL;
	/*
	 * Because lookup_swap_cache() updates some statistics counters,
	 * we call find_get_page() with swapper_space directly.
	 */
	page = find_get_page(swap_address_space(ent), ent.val);
	if (do_swap_account)
		entry->val = ent.val;

	return page;
}
#else
static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	return NULL;
}
#endif

static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!(mc.flags & MOVE_FILE))
		return NULL;

	mapping = vma->vm_file->f_mapping;
	pgoff = linear_page_index(vma, addr);

	/* page is moved even if it's not RSS of this task (page-faulted). */
#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
	if (shmem_mapping(mapping)) {
		page = find_get_entry(mapping, pgoff);
		if (radix_tree_exceptional_entry(page)) {
			swp_entry_t swp = radix_to_swp_entry(page);
			if (do_swap_account)
				*entry = swp;
			page = find_get_page(swap_address_space(swp), swp.val);
		}
	} else
		page = find_get_page(mapping, pgoff);
#else
	page = find_get_page(mapping, pgoff);
#endif
	return page;
}

/**
 * mem_cgroup_move_account - move the accounting of a page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm the following:
 * - page is not on the LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't "charge" the new cgroup and doesn't "uncharge"
 * the old cgroup.
 */
static int mem_cgroup_move_account(struct page *page,
				   unsigned int nr_pages,
				   struct mem_cgroup *from,
				   struct mem_cgroup *to)
{
	unsigned long flags;
	int ret;

	VM_BUG_ON(from == to);
	VM_BUG_ON_PAGE(PageLRU(page), page);
	/*
	 * The page is isolated from the LRU, so the collapse code will not
	 * handle it.  Page splitting, however, can still happen; do this
	 * check under compound_lock(), which the caller must hold.
	 */
	ret = -EBUSY;
	if (nr_pages > 1 && !PageTransHuge(page))
		goto out;

	/*
	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
	 * of its source page while we change it: page migration takes
	 * both pages off the LRU, but page cache replacement doesn't.
	 */
	if (!trylock_page(page))
		goto out;

	ret = -EINVAL;
	if (page->mem_cgroup != from)
		goto out_unlock;

	spin_lock_irqsave(&from->move_lock, flags);

	if (!PageAnon(page) && page_mapped(page)) {
		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
			       nr_pages);
		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
			       nr_pages);
	}

	if (PageWriteback(page)) {
		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
			       nr_pages);
		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
			       nr_pages);
	}

	/*
	 * It is safe to change page->mem_cgroup here because the page
	 * is referenced, charged, and isolated - we can't race with
	 * uncharging, charging, migration, or LRU putback.
	 */

	/* caller should have done css_get */
	page->mem_cgroup = to;
	spin_unlock_irqrestore(&from->move_lock, flags);

	ret = 0;

	local_irq_disable();
	mem_cgroup_charge_statistics(to, page, nr_pages);
	memcg_check_events(to, page);
	mem_cgroup_charge_statistics(from, page, -nr_pages);
	memcg_check_events(from, page);
	local_irq_enable();
out_unlock:
	unlock_page(page);
out:
	return ret;
}
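
/*
 * Illustrative sketch, not part of the original source: the protocol a caller
 * of mem_cgroup_move_account() follows for a single base page (see
 * mem_cgroup_move_charge_pte_range() below for the real caller).  The page
 * must be isolated from the LRU first, and the precharge taken in
 * can_attach() pays for the move:
 *
 *	if (!isolate_lru_page(page)) {
 *		if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
 *			mc.precharge--;
 *			mc.moved_charge++;
 *		}
 *		putback_lru_page(page);
 *	}
 *	put_page(page);
 */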

static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		/*
		 * Do only a loose check without serialization.
		 * mem_cgroup_move_account() checks whether the page is valid
		 * under LRU exclusion.
		 */
		if (page->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/* There is a swap entry and the page doesn't exist or isn't charged */
	if (ent.val && !ret &&
	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * We don't consider swapping or file-mapped pages because THP does not
 * support them for now.
 * The caller must make sure that pmd_trans_huge(pmd) is true.
 */
static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	struct page *page = NULL;
	enum mc_target_type ret = MC_TARGET_NONE;

	page = pmd_page(pmd);
	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
	if (!(mc.flags & MOVE_ANON))
		return ret;
	if (page->mem_cgroup == mc.from) {
		ret = MC_TARGET_PAGE;
		if (target) {
			get_page(page);
			target->page = page;
		}
	}
	return ret;
}
#else
static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
		unsigned long addr, pmd_t pmd, union mc_target *target)
{
	return MC_TARGET_NONE;
}
#endif

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;

	struct mm_walk mem_cgroup_count_precharge_walk = {
		.pmd_entry = mem_cgroup_count_precharge_pte_range,
		.mm = mm,
	};
	down_read(&mm->mmap_sem);
	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);

		/*
		 * we charged both to->memory and to->memsw, so we
		 * should uncharge to->memory.
		 */
		if (!mem_cgroup_is_root(mc.to))
			page_counter_uncharge(&mc.to->memory, mc.moved_swap);

		css_put_many(&mc.from->css, mc.moved_swap);

		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
}

static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	int ret = 0;
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	unsigned long move_flags;

	/*
	 * We are now committed to this value, whatever it is. Changes to this
	 * tunable will only affect upcoming migrations, not the current one,
	 * so we snapshot it here and stick with it.
	 */
	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
	if (move_flags) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == memcg);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move the owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);

			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = memcg;
			mc.flags = move_flags;
			spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
	if (mc.to)
		mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->vma;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;

	/*
	 * We don't take compound_lock() here, but no race with thp splitting
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for page table lock
	 *    to be unlocked in __split_huge_page_splitting(), where the main
	 *    part of thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(ptl);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							     mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			put_page(page);
		}
		spin_unlock(ptl);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all the precharges we got in can_attach().
		 * Now we try to charge one page at a time, and stop making
		 * additional charges to mc.to once a charge has failed during
		 * this attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct mm_walk mem_cgroup_move_charge_walk = {
		.pmd_entry = mem_cgroup_move_charge_pte_range,
		.mm = mm,
	};

	lru_add_drain_all();
	/*
	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
	 * move_lock while we're moving its pages to another memcg.
	 * Then wait for already started RCU-only updates to finish.
	 */
	atomic_inc(&mc.from->moving_account);
	synchronize_rcu();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone holding the mmap_sem might be waiting on our waitq,
		 * so we cancel all extra charges, wake up all waiters, and
		 * retry. Because we cancel the precharges, we might not be
		 * able to move all the charges, but moving charge is a
		 * best-effort feature anyway, so it isn't a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	/*
	 * When we have consumed all precharges and failed in doing
	 * additional charge, the page walk just aborts.
	 */
	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
	up_read(&mm->mmap_sem);
	atomic_dec(&mc.from->moving_account);
}

static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	struct task_struct *p = cgroup_taskset_first(tset);
	struct mm_struct *mm = get_task_mm(p);

	if (mm) {
		if (mc.to)
			mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	if (mc.to)
		mem_cgroup_clear_mc();
}
#else	/* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
}
#endif
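
/*
 * Illustrative summary, not part of the original source: how the cgroup core
 * drives the charge-moving machinery above during a task migration (see the
 * memory_cgrp_subsys callbacks below):
 *
 *	mem_cgroup_can_attach()    - record mc.from/mc.to and precharge mc.to
 *	mem_cgroup_cancel_attach() - migration aborted: drop the precharges
 *	mem_cgroup_move_task()     - walk the mm, move charges, clear mc state
 */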

/*
 * Cgroup retains root cgroups across [un]mount cycles making it necessary
 * to verify whether we're attached to the default hierarchy on each mount
 * attempt.
 */
static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
{
	/*
	 * use_hierarchy is forced on the default hierarchy.  cgroup core
	 * guarantees that @root doesn't have any children, so turning it
	 * on for the root memcg is enough.
	 */
	if (cgroup_on_dfl(root_css->cgroup))
		root_mem_cgroup->use_hierarchy = true;
	else
		root_mem_cgroup->use_hierarchy = false;
}

static u64 memory_current_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return mem_cgroup_usage(mem_cgroup_from_css(css), false);
}

static int memory_low_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long low = READ_ONCE(memcg->low);

	if (low == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);

	return 0;
}

static ssize_t memory_low_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long low;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &low);
	if (err)
		return err;

	memcg->low = low;

	return nbytes;
}

static int memory_high_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long high = READ_ONCE(memcg->high);

	if (high == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);

	return 0;
}

static ssize_t memory_high_write(struct kernfs_open_file *of,
				 char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long high;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &high);
	if (err)
		return err;

	memcg->high = high;

	return nbytes;
}

static int memory_max_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long max = READ_ONCE(memcg->memory.limit);

	if (max == PAGE_COUNTER_MAX)
		seq_puts(m, "max\n");
	else
		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);

	return 0;
}

static ssize_t memory_max_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	unsigned long max;
	int err;

	buf = strstrip(buf);
	err = page_counter_memparse(buf, "max", &max);
	if (err)
		return err;

	err = mem_cgroup_resize_limit(memcg, max);
	if (err)
		return err;

	return nbytes;
}

static int memory_events_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));

	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));

	return 0;
}

static struct cftype memory_files[] = {
	{
		.name = "current",
		.read_u64 = memory_current_read,
	},
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_events_show,
	},
	{ }	/* terminate */
};
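
/*
 * Illustrative usage, not part of the original source: on the unified
 * (cgroup v2) hierarchy these files appear in each cgroup directory.
 * Writes are parsed by page_counter_memparse(), so either a byte value
 * (memparse() suffixes such as K/M/G are accepted) or the literal string
 * "max" works, and reads report bytes.  The cgroup path below is only an
 * example:
 *
 *	echo 512M > /sys/fs/cgroup/example/memory.high
 *	echo max  > /sys/fs/cgroup/example/memory.max
 *	cat /sys/fs/cgroup/example/memory.current
 */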

struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};

/**
 * mem_cgroup_events - count memory events against a cgroup
 * @memcg: the memory cgroup
 * @idx: the event index
 * @nr: the number of events to account for
 */
void mem_cgroup_events(struct mem_cgroup *memcg,
		       enum mem_cgroup_events_index idx,
		       unsigned int nr)
{
	this_cpu_add(memcg->stat->events[idx], nr);
}

/**
 * mem_cgroup_low - check if memory consumption is below the normal range
 * @root: the highest ancestor to consider
 * @memcg: the memory cgroup to check
 *
 * Returns %true if memory consumption of @memcg, and that of all
 * configurable ancestors up to @root, is below the normal range.
 */
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return false;

	/*
	 * The toplevel group doesn't have a configurable range, so
	 * it's never low when looked at directly, and it is not
	 * considered an ancestor when assessing the hierarchy.
	 */

	if (memcg == root_mem_cgroup)
		return false;

	if (page_counter_read(&memcg->memory) >= memcg->low)
		return false;

	while (memcg != root) {
		memcg = parent_mem_cgroup(memcg);

		if (memcg == root_mem_cgroup)
			break;

		if (page_counter_read(&memcg->memory) >= memcg->low)
			return false;
	}
	return true;
}
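
/*
 * Illustrative sketch, not verbatim kernel code: memcg-aware reclaim is
 * expected to consult this check roughly as follows, skipping protected
 * groups and only recording a MEMCG_LOW event (the "low" line in
 * memory.events) when the protection has to be breached to make progress.
 * reclaim_is_desperate is a hypothetical stand-in for that last-resort
 * condition:
 *
 *	if (mem_cgroup_low(root, memcg)) {
 *		if (!reclaim_is_desperate)
 *			continue;
 *		mem_cgroup_events(memcg, MEMCG_LOW, 1);
 *	}
 */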

/**
 * mem_cgroup_try_charge - try charging a page
 * @page: page to charge
 * @mm: mm context of the victim
 * @gfp_mask: reclaim mode
 * @memcgp: charged memcg return
 *
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
 * Otherwise, an error code is returned.
 *
 * After page->mapping has been set up, the caller must finalize the
 * charge with mem_cgroup_commit_charge().  Or abort the transaction
 * with mem_cgroup_cancel_charge() in case page instantiation fails.
 */
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	int ret = 0;

	if (mem_cgroup_disabled())
		goto out;

	if (PageSwapCache(page)) {
		/*
		 * Every swap fault against a single page tries to charge the
		 * page, bail as early as possible.  shmem_unuse() encounters
		 * already charged pages, too.  The USED bit is protected by
		 * the page lock, which serializes swap cache removal, which
		 * in turn serializes uncharging.
		 */
		if (page->mem_cgroup)
			goto out;
	}

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	if (do_swap_account && PageSwapCache(page))
		memcg = try_get_mem_cgroup_from_page(page);
	if (!memcg)
		memcg = get_mem_cgroup_from_mm(mm);

	ret = try_charge(memcg, gfp_mask, nr_pages);

	css_put(&memcg->css);

	if (ret == -EINTR) {
		memcg = root_mem_cgroup;
		ret = 0;
	}
out:
	*memcgp = memcg;
	return ret;
}
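
/*
 * Illustrative sketch, not part of the original source: the charge
 * transaction as a page-faulting caller is expected to use it (error
 * handling abbreviated).  The commit must follow the rmap/page-cache
 * setup, and mem_cgroup_cancel_charge() unwinds the charge if the page
 * is never instantiated:
 *
 *	struct mem_cgroup *memcg;
 *
 *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 *		goto oom;
 *	...
 *	page_add_new_anon_rmap(page, vma, address);
 *	mem_cgroup_commit_charge(page, memcg, false);
 *	...
 *	mem_cgroup_cancel_charge(page, memcg);	(only on the failure path)
 */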

/**
 * mem_cgroup_commit_charge - commit a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 * @lrucare: page might be on LRU already
 *
 * Finalize a charge transaction started by mem_cgroup_try_charge(),
 * after page->mapping has been set up.  This must happen atomically
 * as part of the page instantiation, i.e. under the page table lock
 * for anonymous pages, under the page lock for page and swap cache.
 *
 * In addition, the page must not be on the LRU during the commit, to
 * prevent racing with task migration.  If it might be, use @lrucare.
 *
 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
 */
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare)
{
	unsigned int nr_pages = 1;

	VM_BUG_ON_PAGE(!page->mapping, page);
	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	commit_charge(page, memcg, lrucare);

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	local_irq_disable();
	mem_cgroup_charge_statistics(memcg, page, nr_pages);
	memcg_check_events(memcg, page);
	local_irq_enable();

	if (do_swap_account && PageSwapCache(page)) {
		swp_entry_t entry = { .val = page_private(page) };
		/*
		 * The swap entry might not get freed for a long time,
		 * let's not wait for it.  The page already received a
		 * memory+swap charge, drop the swap entry duplicate.
		 */
		mem_cgroup_uncharge_swap(entry);
	}
}

/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
{
	unsigned int nr_pages = 1;

	if (mem_cgroup_disabled())
		return;
	/*
	 * Swap faults will attempt to charge the same page multiple
	 * times.  But reuse_swap_page() might have removed the page
	 * from swapcache already, so we can't check PageSwapCache().
	 */
	if (!memcg)
		return;

	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
	}

	cancel_charge(memcg, nr_pages);
}

static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
			   unsigned long nr_anon, unsigned long nr_file,
			   unsigned long nr_huge, struct page *dummy_page)
{
	unsigned long nr_pages = nr_anon + nr_file;
	unsigned long flags;

	if (!mem_cgroup_is_root(memcg)) {
		page_counter_uncharge(&memcg->memory, nr_pages);
		if (do_swap_account)
			page_counter_uncharge(&memcg->memsw, nr_pages);
		memcg_oom_recover(memcg);
	}

	local_irq_save(flags);
	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
	memcg_check_events(memcg, dummy_page);
	local_irq_restore(flags);

	if (!mem_cgroup_is_root(memcg))
		css_put_many(&memcg->css, nr_pages);
}

static void uncharge_list(struct list_head *page_list)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long nr_anon = 0;
	unsigned long nr_file = 0;
	unsigned long nr_huge = 0;
	unsigned long pgpgout = 0;
	struct list_head *next;
	struct page *page;

	next = page_list->next;
	do {
		unsigned int nr_pages = 1;

		page = list_entry(next, struct page, lru);
		next = page->lru.next;

		VM_BUG_ON_PAGE(PageLRU(page), page);
		VM_BUG_ON_PAGE(page_count(page), page);

		if (!page->mem_cgroup)
			continue;

		/*
		 * Nobody should be changing or seriously looking at
		 * page->mem_cgroup at this point, we have fully
		 * exclusive access to the page.
		 */

		if (memcg != page->mem_cgroup) {
			if (memcg) {
				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
					       nr_huge, page);
				pgpgout = nr_anon = nr_file = nr_huge = 0;
			}
			memcg = page->mem_cgroup;
		}

		if (PageTransHuge(page)) {
			nr_pages <<= compound_order(page);
			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
			nr_huge += nr_pages;
		}

		if (PageAnon(page))
			nr_anon += nr_pages;
		else
			nr_file += nr_pages;

		page->mem_cgroup = NULL;

		pgpgout++;
	} while (next != page_list);

	if (memcg)
		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
			       nr_huge, page);
}

/**
 * mem_cgroup_uncharge - uncharge a page
 * @page: page to uncharge
 *
 * Uncharge a page previously charged with mem_cgroup_try_charge() and
 * mem_cgroup_commit_charge().
 */
void mem_cgroup_uncharge(struct page *page)
{
	if (mem_cgroup_disabled())
		return;

	/* Don't touch page->lru of any random page, pre-check: */
	if (!page->mem_cgroup)
		return;

	INIT_LIST_HEAD(&page->lru);
	uncharge_list(&page->lru);
}

/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;

	if (!list_empty(page_list))
		uncharge_list(page_list);
}

/**
 * mem_cgroup_migrate - migrate a charge to another page
 * @oldpage: currently charged page
 * @newpage: page to transfer the charge to
 * @lrucare: either or both pages might be on the LRU already
 *
 * Migrate the charge from @oldpage to @newpage.
 *
 * Both pages must be locked, @newpage->mapping must be set up.
 */
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
			bool lrucare)
{
	struct mem_cgroup *memcg;
	int isolated;

	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
		       newpage);

	if (mem_cgroup_disabled())
		return;

	/* Page cache replacement: new page already charged? */
	if (newpage->mem_cgroup)
		return;

	/*
	 * Swapcache readahead pages can get migrated before being
	 * charged, and migration from compaction can happen to an
	 * uncharged page when the PFN walker finds a page that
	 * reclaim just put back on the LRU but has not released yet.
	 */
	memcg = oldpage->mem_cgroup;
	if (!memcg)
		return;

	if (lrucare)
		lock_page_lru(oldpage, &isolated);

	oldpage->mem_cgroup = NULL;

	if (lrucare)
		unlock_page_lru(oldpage, isolated);

	commit_charge(newpage, memcg, lrucare);
}

/*
 * subsys_initcall() for memory controller.
 *
 * Some parts like hotcpu_notifier() have to be initialized from this context
 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
 * everything that doesn't depend on a specific mem_cgroup structure should
 * be initialized from here.
 */
static int __init mem_cgroup_init(void)
{
	int cpu, node;

	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);

	for_each_possible_cpu(cpu)
		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
			  drain_local_stock);

	for_each_node(node) {
		struct mem_cgroup_tree_per_node *rtpn;
		int zone;

		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
				    node_online(node) ? node : NUMA_NO_NODE);

		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			struct mem_cgroup_tree_per_zone *rtpz;

			rtpz = &rtpn->rb_tree_per_zone[zone];
			rtpz->rb_root = RB_ROOT;
			spin_lock_init(&rtpz->lock);
		}
		soft_limit_tree.rb_tree_per_node[node] = rtpn;
	}

	return 0;
}
subsys_initcall(mem_cgroup_init);

#ifdef CONFIG_MEMCG_SWAP
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short oldid;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (!do_swap_account)
		return;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return;

	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(memcg, true);

	page->mem_cgroup = NULL;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, 1);

	/* Caller disabled preemption with mapping->tree_lock */
	mem_cgroup_charge_statistics(memcg, page, -1);
	memcg_check_events(memcg, page);
}

/**
 * mem_cgroup_uncharge_swap - uncharge a swap entry
 * @entry: swap entry to uncharge
 *
 * Drop the memsw charge associated with @entry.
 */
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;

	id = swap_cgroup_record(entry, 0);
	rcu_read_lock();
	memcg = mem_cgroup_from_id(id);
	if (memcg) {
		if (!mem_cgroup_is_root(memcg))
			page_counter_uncharge(&memcg->memsw, 1);
		mem_cgroup_swap_statistics(memcg, false);
		css_put(&memcg->css);
	}
	rcu_read_unlock();
}

/* Remember the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif

static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);
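
/*
 * Example, not part of the original source: the boot parameter parsed above
 * toggles memory+swap accounting regardless of the CONFIG_MEMCG_SWAP_ENABLED
 * default, e.g. on the kernel command line:
 *
 *	swapaccount=0	disable memory+swap accounting
 *	swapaccount=1	enable memory+swap accounting
 */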

static struct cftype memsw_cgroup_files[] = {
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write = mem_cgroup_write,
		.read_u64 = mem_cgroup_read_u64,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.write = mem_cgroup_reset,
		.read_u64 = mem_cgroup_read_u64,
	},
	{ },	/* terminate */
};

static int __init mem_cgroup_swap_init(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account) {
		do_swap_account = 1;
		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
						  memsw_cgroup_files));
	}
	return 0;
}
subsys_initcall(mem_cgroup_swap_init);

#endif /* CONFIG_MEMCG_SWAP */
