Merge tag 'v4.11-rc6' into sched/core, to pick up fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar
2017-04-11 09:05:36 +02:00
957 changed files with 13076 additions and 6865 deletions

View File

@@ -54,6 +54,10 @@
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/audit.h>
@@ -90,13 +94,34 @@ static u32 audit_default;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
/*
* If audit records are to be written to the netlink socket, audit_pid
* contains the pid of the auditd process and audit_nlk_portid contains
* the portid to use to send netlink messages to that process.
/* private audit network namespace index */
static unsigned int audit_net_id;
/**
* struct audit_net - audit private network namespace data
* @sk: communication socket
*/
int audit_pid;
static __u32 audit_nlk_portid;
struct audit_net {
struct sock *sk;
};
/**
* struct auditd_connection - kernel/auditd connection state
* @pid: auditd PID
* @portid: netlink portid
* @net: the associated network namespace
* @lock: spinlock to protect write access
*
* Description:
* This struct is RCU protected; you must either hold the RCU lock for reading
* or the included spinlock for writing.
*/
static struct auditd_connection {
int pid;
u32 portid;
struct net *net;
spinlock_t lock;
} auditd_conn;
/* If audit_rate_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -123,10 +148,6 @@ u32 audit_sig_sid = 0;
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
static unsigned int audit_net_id;
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -139,6 +160,7 @@ static LIST_HEAD(audit_freelist);
/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
static void kauditd_hold_skb(struct sk_buff *skb);
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
@@ -192,6 +214,43 @@ struct audit_reply {
struct sk_buff *skb;
};
/**
* auditd_test_task - Check to see if a given task is an audit daemon
* @task: the task to check
*
* Description:
* Return 1 if the task is a registered audit daemon, 0 otherwise.
*/
int auditd_test_task(const struct task_struct *task)
{
int rc;
rcu_read_lock();
rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0);
rcu_read_unlock();
return rc;
}
/**
* audit_get_sk - Return the audit socket for the given network namespace
* @net: the destination network namespace
*
* Description:
* Returns the sock pointer if valid, NULL otherwise. The caller must ensure
* that a reference is held for the network namespace while the sock is in use.
*/
static struct sock *audit_get_sk(const struct net *net)
{
struct audit_net *aunet;
if (!net)
return NULL;
aunet = net_generic(net, audit_net_id);
return aunet->sk;
}
static void audit_set_portid(struct audit_buffer *ab, __u32 portid)
{
if (ab) {
@@ -210,9 +269,7 @@ void audit_panic(const char *message)
pr_err("%s\n", message);
break;
case AUDIT_FAIL_PANIC:
/* test audit_pid since printk is always losey, why bother? */
if (audit_pid)
panic("audit: %s\n", message);
panic("audit: %s\n", message);
break;
}
}
@@ -370,21 +427,87 @@ static int audit_set_failure(u32 state)
return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
* For one reason or another this nlh isn't getting delivered to the userspace
* audit daemon, just send it to printk.
/**
* auditd_set - Set/Reset the auditd connection state
* @pid: auditd PID
* @portid: auditd netlink portid
* @net: auditd network namespace pointer
*
* Description:
* This function will obtain and drop network namespace references as
* necessary.
*/
static void auditd_set(int pid, u32 portid, struct net *net)
{
unsigned long flags;
spin_lock_irqsave(&auditd_conn.lock, flags);
auditd_conn.pid = pid;
auditd_conn.portid = portid;
if (auditd_conn.net)
put_net(auditd_conn.net);
if (net)
auditd_conn.net = get_net(net);
else
auditd_conn.net = NULL;
spin_unlock_irqrestore(&auditd_conn.lock, flags);
}
/**
* auditd_reset - Disconnect the auditd connection
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
* hold queue in case auditd reconnects.
*/
static void auditd_reset(void)
{
struct sk_buff *skb;
/* if it isn't already broken, break the connection */
rcu_read_lock();
if (auditd_conn.pid)
auditd_set(0, 0, NULL);
rcu_read_unlock();
/* flush all of the main and retry queues to the hold queue */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
while ((skb = skb_dequeue(&audit_queue)))
kauditd_hold_skb(skb);
}
/**
* kauditd_print_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
* so write it via printk so the information isn't completely lost.
*/
static void kauditd_printk_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh = nlmsg_hdr(skb);
char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
if (printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
audit_log_lost("printk limit exceeded");
}
if (nlh->nlmsg_type != AUDIT_EOE && printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
}
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
static void kauditd_rehold_skb(struct sk_buff *skb)
{
/* put the record back in the queue at the same place */
skb_queue_head(&audit_hold_queue, skb);
/* fail the auditd connection */
auditd_reset();
}
/**
@@ -421,6 +544,9 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
kfree_skb(skb);
/* fail the auditd connection */
auditd_reset();
}
/**
@@ -441,51 +567,122 @@ static void kauditd_retry_skb(struct sk_buff *skb)
}
/**
* auditd_reset - Disconnect the auditd connection
* auditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
*
* Description:
* Break the auditd/kauditd connection and move all the records in the retry
* queue into the hold queue in case auditd reconnects. The audit_cmd_mutex
* must be held when calling this function.
* Send a skb to the audit daemon, returns positive/zero values on success and
* negative values on failure; in all cases the skb will be consumed by this
* function. If the send results in -ECONNREFUSED the connection with auditd
* will be reset. This function may sleep so callers should not hold any locks
* where this would cause a problem.
*/
static void auditd_reset(void)
static int auditd_send_unicast_skb(struct sk_buff *skb)
{
struct sk_buff *skb;
int rc;
u32 portid;
struct net *net;
struct sock *sk;
/* break the connection */
if (audit_sock) {
sock_put(audit_sock);
audit_sock = NULL;
/* NOTE: we can't call netlink_unicast while in the RCU section so
* take a reference to the network namespace and grab local
* copies of the namespace, the sock, and the portid; the
* namespace and sock aren't going to go away while we hold a
* reference and if the portid does become invalid after the RCU
* section netlink_unicast() should safely return an error */
rcu_read_lock();
if (!auditd_conn.pid) {
rcu_read_unlock();
rc = -ECONNREFUSED;
goto err;
}
audit_pid = 0;
audit_nlk_portid = 0;
net = auditd_conn.net;
get_net(net);
sk = audit_get_sk(net);
portid = auditd_conn.portid;
rcu_read_unlock();
/* flush all of the retry queue to the hold queue */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
rc = netlink_unicast(sk, skb, portid, 0);
put_net(net);
if (rc < 0)
goto err;
return rc;
err:
if (rc == -ECONNREFUSED)
auditd_reset();
return rc;
}
/**
* kauditd_send_unicast_skb - Send a record via unicast to auditd
* @skb: audit record
* kauditd_send_queue - Helper for kauditd_thread to flush skb queues
* @sk: the sending sock
* @portid: the netlink destination
* @queue: the skb queue to process
* @retry_limit: limit on number of netlink unicast failures
* @skb_hook: per-skb hook for additional processing
* @err_hook: hook called if the skb fails the netlink unicast send
*
* Description:
* Run through the given queue and attempt to send the audit records to auditd,
* returns zero on success, negative values on failure. It is up to the caller
* to ensure that the @sk is valid for the duration of this function.
*
*/
static int kauditd_send_unicast_skb(struct sk_buff *skb)
static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
void (*err_hook)(struct sk_buff *skb))
{
int rc;
int rc = 0;
struct sk_buff *skb;
static unsigned int failed = 0;
/* if we know nothing is connected, don't even try the netlink call */
if (!audit_pid)
return -ECONNREFUSED;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
/* get an extra skb reference in case we fail to send */
skb_get(skb);
rc = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (rc >= 0) {
consume_skb(skb);
rc = 0;
while ((skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
(*err_hook)(skb);
continue;
}
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
/* fatal failure for our queue flush attempt? */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
/* yes - error processing for the queue */
sk = NULL;
if (err_hook)
(*err_hook)(skb);
if (!skb_hook)
goto out;
/* keep processing with the skb_hook */
continue;
} else
/* no - requeue to preserve ordering */
skb_queue_head(queue, skb);
} else {
/* it worked - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
return rc;
out:
return (rc >= 0 ? 0 : rc);
}
/*
@@ -493,16 +690,19 @@ static int kauditd_send_unicast_skb(struct sk_buff *skb)
* @skb: audit record
*
* Description:
* This function doesn't consume an skb as might be expected since it has to
* copy it anyways.
* Write a multicast message to anyone listening in the initial network
* namespace. This function doesn't consume an skb as might be expected since
* it has to copy it anyways.
*/
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
struct sk_buff *copy;
struct audit_net *aunet = net_generic(&init_net, audit_net_id);
struct sock *sock = aunet->nlsk;
struct sock *sock = audit_get_sk(&init_net);
struct nlmsghdr *nlh;
/* NOTE: we are not taking an additional reference for init_net since
* we don't have to worry about it going away */
if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
return;
@@ -526,149 +726,75 @@ static void kauditd_send_multicast_skb(struct sk_buff *skb)
}
/**
* kauditd_wake_condition - Return true when it is time to wake kauditd_thread
*
* Description:
* This function is for use by the wait_event_freezable() call in
* kauditd_thread().
* kauditd_thread - Worker thread to send audit records to userspace
* @dummy: unused
*/
static int kauditd_wake_condition(void)
{
static int pid_last = 0;
int rc;
int pid = audit_pid;
/* wake on new messages or a change in the connected auditd */
rc = skb_queue_len(&audit_queue) || (pid && pid != pid_last);
if (rc)
pid_last = pid;
return rc;
}
static int kauditd_thread(void *dummy)
{
int rc;
int auditd = 0;
int reschedule = 0;
struct sk_buff *skb;
struct nlmsghdr *nlh;
u32 portid = 0;
struct net *net = NULL;
struct sock *sk = NULL;
#define UNICAST_RETRIES 5
#define AUDITD_BAD(x,y) \
((x) == -ECONNREFUSED || (x) == -EPERM || ++(y) >= UNICAST_RETRIES)
/* NOTE: we do invalidate the auditd connection flag on any sending
* errors, but we only "restore" the connection flag at specific places
* in the loop in order to help ensure proper ordering of audit
* records */
set_freezable();
while (!kthread_should_stop()) {
/* NOTE: possible area for future improvement is to look at
* the hold and retry queues, since only this thread
* has access to these queues we might be able to do
* our own queuing and skip some/all of the locking */
/* NOTE: it might be a fun experiment to split the hold and
* retry queue handling to another thread, but the
* synchronization issues and other overhead might kill
* any performance gains */
/* NOTE: see the lock comments in auditd_send_unicast_skb() */
rcu_read_lock();
if (!auditd_conn.pid) {
rcu_read_unlock();
goto main_queue;
}
net = auditd_conn.net;
get_net(net);
sk = audit_get_sk(net);
portid = auditd_conn.portid;
rcu_read_unlock();
/* attempt to flush the hold queue */
while (auditd && (skb = skb_dequeue(&audit_hold_queue))) {
rc = kauditd_send_unicast_skb(skb);
if (rc) {
/* requeue to the same spot */
skb_queue_head(&audit_hold_queue, skb);
auditd = 0;
if (AUDITD_BAD(rc, reschedule)) {
mutex_lock(&audit_cmd_mutex);
auditd_reset();
mutex_unlock(&audit_cmd_mutex);
reschedule = 0;
}
} else
/* we were able to send successfully */
reschedule = 0;
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
if (rc < 0) {
sk = NULL;
goto main_queue;
}
/* attempt to flush the retry queue */
while (auditd && (skb = skb_dequeue(&audit_retry_queue))) {
rc = kauditd_send_unicast_skb(skb);
if (rc) {
auditd = 0;
if (AUDITD_BAD(rc, reschedule)) {
kauditd_hold_skb(skb);
mutex_lock(&audit_cmd_mutex);
auditd_reset();
mutex_unlock(&audit_cmd_mutex);
reschedule = 0;
} else
/* temporary problem (we hope), queue
* to the same spot and retry */
skb_queue_head(&audit_retry_queue, skb);
} else
/* we were able to send successfully */
reschedule = 0;
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
if (rc < 0) {
sk = NULL;
goto main_queue;
}
/* standard queue processing, try to be as quick as possible */
quick_loop:
skb = skb_dequeue(&audit_queue);
if (skb) {
/* setup the netlink header, see the comments in
* kauditd_send_multicast_skb() for length quirks */
nlh = nlmsg_hdr(skb);
nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
* multicast send and move the record to the retry queue */
kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
kauditd_retry_skb);
/* attempt to send to any multicast listeners */
kauditd_send_multicast_skb(skb);
/* attempt to send to auditd, queue on failure */
if (auditd) {
rc = kauditd_send_unicast_skb(skb);
if (rc) {
auditd = 0;
if (AUDITD_BAD(rc, reschedule)) {
mutex_lock(&audit_cmd_mutex);
auditd_reset();
mutex_unlock(&audit_cmd_mutex);
reschedule = 0;
}
/* move to the retry queue */
kauditd_retry_skb(skb);
} else
/* everything is working so go fast! */
goto quick_loop;
} else if (reschedule)
/* we are currently having problems, move to
* the retry queue */
kauditd_retry_skb(skb);
else
/* dump the message via printk and hold it */
kauditd_hold_skb(skb);
} else {
/* we have flushed the backlog so wake everyone */
wake_up(&audit_backlog_wait);
/* if everything is okay with auditd (if present), go
* to sleep until there is something new in the queue
* or we have a change in the connected auditd;
* otherwise simply reschedule to give things a chance
* to recover */
if (reschedule) {
set_current_state(TASK_INTERRUPTIBLE);
schedule();
} else
wait_event_freezable(kauditd_wait,
kauditd_wake_condition());
/* update the auditd connection status */
auditd = (audit_pid ? 1 : 0);
/* drop our netns reference, no auditd sends past this line */
if (net) {
put_net(net);
net = NULL;
}
sk = NULL;
/* we have processed all the queues so wake everyone */
wake_up(&audit_backlog_wait);
/* NOTE: we want to wake up if there is anything on the queue,
* regardless of if an auditd is connected, as we need to
* do the multicast send and rotate records from the
* main queue to the retry/hold queues */
wait_event_freezable(kauditd_wait,
(skb_queue_len(&audit_queue) ? 1 : 0));
}
return 0;
@@ -678,17 +804,16 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
struct sock *sk = audit_get_sk(dest->net);
/* wait for parent to finish and send an ACK */
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
netlink_unicast(sk, skb, dest->portid, 0);
put_net(net);
put_net(dest->net);
kfree(dest);
return 0;
@@ -722,16 +847,15 @@ out_kfree_skb:
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
struct sock *sk = audit_get_sk(reply->net);
mutex_lock(&audit_cmd_mutex);
mutex_unlock(&audit_cmd_mutex);
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
put_net(net);
netlink_unicast(sk, reply->skb, reply->portid, 0);
put_net(reply->net);
kfree(reply);
return 0;
}
@@ -949,12 +1073,12 @@ static int audit_set_feature(struct sk_buff *skb)
static int audit_replace(pid_t pid)
{
struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0,
&pid, sizeof(pid));
struct sk_buff *skb;
skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid));
if (!skb)
return -ENOMEM;
return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
return auditd_send_unicast_skb(skb);
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -981,7 +1105,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
s.enabled = audit_enabled;
s.failure = audit_failure;
s.pid = audit_pid;
rcu_read_lock();
s.pid = auditd_conn.pid;
rcu_read_unlock();
s.rate_limit = audit_rate_limit;
s.backlog_limit = audit_backlog_limit;
s.lost = atomic_read(&audit_lost);
@@ -1014,30 +1140,44 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
* from the initial pid namespace, but something
* to keep in mind if this changes */
int new_pid = s.pid;
pid_t auditd_pid;
pid_t requesting_pid = task_tgid_vnr(current);
if ((!new_pid) && (requesting_pid != audit_pid)) {
audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
/* test the auditd connection */
audit_replace(requesting_pid);
rcu_read_lock();
auditd_pid = auditd_conn.pid;
/* only the current auditd can unregister itself */
if ((!new_pid) && (requesting_pid != auditd_pid)) {
rcu_read_unlock();
audit_log_config_change("audit_pid", new_pid,
auditd_pid, 0);
return -EACCES;
}
if (audit_pid && new_pid &&
audit_replace(requesting_pid) != -ECONNREFUSED) {
audit_log_config_change("audit_pid", new_pid, audit_pid, 0);
/* replacing a healthy auditd is not allowed */
if (auditd_pid && new_pid) {
rcu_read_unlock();
audit_log_config_change("audit_pid", new_pid,
auditd_pid, 0);
return -EEXIST;
}
rcu_read_unlock();
if (audit_enabled != AUDIT_OFF)
audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_log_config_change("audit_pid", new_pid,
auditd_pid, 1);
if (new_pid) {
if (audit_sock)
sock_put(audit_sock);
audit_pid = new_pid;
audit_nlk_portid = NETLINK_CB(skb).portid;
sock_hold(skb->sk);
audit_sock = skb->sk;
} else {
/* register a new auditd connection */
auditd_set(new_pid,
NETLINK_CB(skb).portid,
sock_net(NETLINK_CB(skb).sk));
/* try to process any backlog */
wake_up_interruptible(&kauditd_wait);
} else
/* unregister the auditd connection */
auditd_reset();
}
wake_up_interruptible(&kauditd_wait);
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
err = audit_set_rate_limit(s.rate_limit);
@@ -1090,7 +1230,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
break;
}
mutex_unlock(&audit_cmd_mutex);
audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.*s'",
@@ -1108,7 +1247,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
}
audit_set_portid(ab, NETLINK_CB(skb).portid);
audit_log_end(ab);
mutex_lock(&audit_cmd_mutex);
}
break;
case AUDIT_ADD_RULE:
@@ -1298,26 +1436,26 @@ static int __net_init audit_net_init(struct net *net)
struct audit_net *aunet = net_generic(net, audit_net_id);
aunet->nlsk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
if (aunet->nlsk == NULL) {
aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
if (aunet->sk == NULL) {
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
aunet->nlsk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
return 0;
}
static void __net_exit audit_net_exit(struct net *net)
{
struct audit_net *aunet = net_generic(net, audit_net_id);
struct sock *sock = aunet->nlsk;
mutex_lock(&audit_cmd_mutex);
if (sock == audit_sock)
auditd_reset();
mutex_unlock(&audit_cmd_mutex);
netlink_kernel_release(sock);
aunet->nlsk = NULL;
rcu_read_lock();
if (net == auditd_conn.net)
auditd_reset();
rcu_read_unlock();
netlink_kernel_release(aunet->sk);
}
static struct pernet_operations audit_net_ops __net_initdata = {
@@ -1335,20 +1473,24 @@ static int __init audit_init(void)
if (audit_initialized == AUDIT_DISABLED)
return 0;
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
memset(&auditd_conn, 0, sizeof(auditd_conn));
spin_lock_init(&auditd_conn.lock);
skb_queue_head_init(&audit_queue);
skb_queue_head_init(&audit_retry_queue);
skb_queue_head_init(&audit_hold_queue);
audit_initialized = AUDIT_INITIALIZED;
audit_enabled = audit_default;
audit_ever_enabled |= !!audit_default;
for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
INIT_LIST_HEAD(&audit_inode_hash[i]);
pr_info("initializing netlink subsys (%s)\n",
audit_default ? "enabled" : "disabled");
register_pernet_subsys(&audit_net_ops);
audit_initialized = AUDIT_INITIALIZED;
audit_enabled = audit_default;
audit_ever_enabled |= !!audit_default;
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
if (IS_ERR(kauditd_task)) {
int err = PTR_ERR(kauditd_task);
@@ -1519,20 +1661,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(!audit_filter(type, AUDIT_FILTER_TYPE)))
return NULL;
/* don't ever fail/sleep on these two conditions:
/* NOTE: don't ever fail/sleep on these two conditions:
* 1. auditd generated record - since we need auditd to drain the
* queue; also, when we are checking for auditd, compare PIDs using
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. audit command message - record types 1000 through 1099 inclusive
* are command messages/records used to manage the kernel subsystem
* and the audit userspace, blocking on these messages could cause
* problems under load so don't do it (note: not all of these
* command types are valid as record types, but it is quicker to
* just check two ints than a series of ints in a if/switch stmt) */
if (!((audit_pid && audit_pid == task_tgid_vnr(current)) ||
(type >= 1000 && type <= 1099))) {
long sleep_time = audit_backlog_wait_time;
* 2. generator holding the audit_cmd_mutex - we don't want to block
* while holding the mutex */
if (!(auditd_test_task(current) ||
(current == __mutex_owner(&audit_cmd_mutex)))) {
long stime = audit_backlog_wait_time;
while (audit_backlog_limit &&
(skb_queue_len(&audit_queue) > audit_backlog_limit)) {
@@ -1541,14 +1679,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if ((gfp_mask & __GFP_DIRECT_RECLAIM) &&
(sleep_time > 0)) {
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
sleep_time = schedule_timeout(sleep_time);
stime = schedule_timeout(stime);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -2127,15 +2264,27 @@ out:
*/
void audit_log_end(struct audit_buffer *ab)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
if (!ab)
return;
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
skb_queue_tail(&audit_queue, ab->skb);
wake_up_interruptible(&kauditd_wait);
if (audit_rate_check()) {
skb = ab->skb;
ab->skb = NULL;
}
/* setup the netlink header, see the comments in
* kauditd_send_multicast_skb() for length quirks */
nlh = nlmsg_hdr(skb);
nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;
/* queue the netlink packet and poke the kauditd thread */
skb_queue_tail(&audit_queue, skb);
wake_up_interruptible(&kauditd_wait);
} else
audit_log_lost("rate limit exceeded");
audit_buffer_free(ab);
}

View File

@@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context,
struct audit_names *n, const struct path *path,
int record_num, int *call_panic);
extern int audit_pid;
extern int auditd_test_task(const struct task_struct *task);
#define AUDIT_INODE_BUCKETS 32
extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -250,10 +250,6 @@ struct audit_netlink_list {
int audit_send_list(void *);
struct audit_net {
struct sock *nlsk;
};
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
@@ -337,14 +333,7 @@ extern u32 audit_sig_sid;
extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
extern int __audit_signal_info(int sig, struct task_struct *t);
static inline int audit_signal_info(int sig, struct task_struct *t)
{
if (unlikely((audit_pid && t->tgid == audit_pid) ||
(audit_signals && !audit_dummy_context())))
return __audit_signal_info(sig, t);
return 0;
}
extern int audit_signal_info(int sig, struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
extern struct list_head *audit_killed_trees(void);
#else

View File

@@ -762,7 +762,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
struct audit_entry *e;
enum audit_state state;
if (audit_pid && tsk->tgid == audit_pid)
if (auditd_test_task(tsk))
return AUDIT_DISABLED;
rcu_read_lock();
@@ -816,7 +816,7 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
struct audit_names *n;
if (audit_pid && tsk->tgid == audit_pid)
if (auditd_test_task(tsk))
return;
rcu_read_lock();
@@ -2249,26 +2249,27 @@ void __audit_ptrace(struct task_struct *t)
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
int __audit_signal_info(int sig, struct task_struct *t)
int audit_signal_info(int sig, struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct task_struct *tsk = current;
struct audit_context *ctx = tsk->audit_context;
kuid_t uid = current_uid(), t_uid = task_uid(t);
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
security_task_getsecid(tsk, &audit_sig_sid);
}
if (!audit_signals || audit_dummy_context())
return 0;
if (auditd_test_task(t) &&
(sig == SIGTERM || sig == SIGHUP ||
sig == SIGUSR1 || sig == SIGUSR2)) {
audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
audit_sig_uid = uid;
security_task_getsecid(tsk, &audit_sig_sid);
}
if (!audit_signals || audit_dummy_context())
return 0;
/* optimize the common case by putting first signal recipient directly
* in audit_context */
if (!ctx->target_pid) {

View File

@@ -30,18 +30,12 @@ struct bpf_htab {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
void __percpu *extra_elems;
struct htab_elem *__percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
enum extra_elem_state {
HTAB_NOT_AN_EXTRA_ELEM = 0,
HTAB_EXTRA_ELEM_FREE,
HTAB_EXTRA_ELEM_USED
};
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
union {
@@ -56,7 +50,6 @@ struct htab_elem {
};
union {
struct rcu_head rcu;
enum extra_elem_state state;
struct bpf_lru_node lru_node;
};
u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
static bool htab_is_prealloc(const struct bpf_htab *htab)
{
return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
static int prealloc_init(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
htab->elems = bpf_map_area_alloc(htab->elem_size *
htab->map.max_entries);
if (!htab_is_percpu(htab) && !htab_is_lru(htab))
num_entries += num_possible_cpus();
htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
if (!htab->elems)
return -ENOMEM;
if (!htab_is_percpu(htab))
goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) {
for (i = 0; i < num_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
@@ -166,11 +167,11 @@ skip_percpu_elems:
if (htab_is_lru(htab))
bpf_lru_populate(&htab->lru, htab->elems,
offsetof(struct htab_elem, lru_node),
htab->elem_size, htab->map.max_entries);
htab->elem_size, num_entries);
else
pcpu_freelist_populate(&htab->freelist,
htab->elems + offsetof(struct htab_elem, fnode),
htab->elem_size, htab->map.max_entries);
htab->elem_size, num_entries);
return 0;
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
static int alloc_extra_elems(struct bpf_htab *htab)
{
void __percpu *pptr;
struct htab_elem *__percpu *pptr, *l_new;
struct pcpu_freelist_node *l;
int cpu;
pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
for_each_possible_cpu(cpu) {
((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
HTAB_EXTRA_ELEM_FREE;
l = pcpu_freelist_pop(&htab->freelist);
/* pop will succeed, since prealloc_init()
* preallocated extra num_possible_cpus elements
*/
l_new = container_of(l, struct htab_elem, fnode);
*per_cpu_ptr(pptr, cpu) = l_new;
}
htab->extra_elems = pptr;
return 0;
@@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock);
}
if (!percpu && !lru) {
/* lru itself can remove the least used element, so
* there is no need for an extra elem during map_update.
*/
err = alloc_extra_elems(htab);
if (err)
goto free_buckets;
}
if (prealloc) {
err = prealloc_init(htab);
if (err)
goto free_extra_elems;
goto free_buckets;
if (!percpu && !lru) {
/* lru itself can remove the least used element, so
* there is no need for an extra elem during map_update.
*/
err = alloc_extra_elems(htab);
if (err)
goto free_prealloc;
}
}
return &htab->map;
free_extra_elems:
free_percpu(htab->extra_elems);
free_prealloc:
prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
free_htab:
@@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
if (l->state == HTAB_EXTRA_ELEM_USED) {
l->state = HTAB_EXTRA_ELEM_FREE;
return;
}
if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
if (htab_is_prealloc(htab)) {
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -610,47 +612,43 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
bool old_elem_exists)
struct htab_elem *old_elem)
{
u32 size = htab->map.value_size;
bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
struct htab_elem *l_new;
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
int err = 0;
if (prealloc) {
struct pcpu_freelist_node *l;
l = pcpu_freelist_pop(&htab->freelist);
if (!l)
err = -E2BIG;
else
l_new = container_of(l, struct htab_elem, fnode);
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
err = -E2BIG;
if (old_elem) {
/* if we're updating the existing element,
* use per-cpu extra elems to avoid freelist_pop/push
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
*pl_new = old_elem;
} else {
l_new = kmalloc(htab->elem_size,
GFP_ATOMIC | __GFP_NOWARN);
if (!l_new)
return ERR_PTR(-ENOMEM);
struct pcpu_freelist_node *l;
l = pcpu_freelist_pop(&htab->freelist);
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
}
}
if (err) {
if (!old_elem_exists)
return ERR_PTR(err);
/* if we're updating the existing element and the hash table
* is full, use per-cpu extra elems
*/
l_new = this_cpu_ptr(htab->extra_elems);
if (l_new->state != HTAB_EXTRA_ELEM_FREE)
return ERR_PTR(-E2BIG);
l_new->state = HTAB_EXTRA_ELEM_USED;
} else {
l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
if (atomic_inc_return(&htab->count) > htab->map.max_entries)
if (!old_elem) {
/* when map is full and update() is replacing
* old element, it's ok to allocate, since
* old element will be freed immediately.
* Otherwise return an error
*/
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
if (!l_new)
return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size);
@@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
goto err;
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
!!l_old);
l_old);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
@@ -744,7 +742,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
free_htab_elem(htab, l_old);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
}
ret = 0;
err:
@@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
value, onallcpus);
} else {
l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus, false);
hash, true, onallcpus, NULL);
if (IS_ERR(l_new)) {
ret = PTR_ERR(l_new);
goto err;
@@ -1024,8 +1023,7 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
hlist_nulls_del_rcu(&l->hash_node);
if (l->state != HTAB_EXTRA_ELEM_USED)
htab_elem_free(htab, l);
htab_elem_free(htab, l);
}
}
}
@@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them.
*/
rcu_barrier();
if (htab->map.map_flags & BPF_F_NO_PREALLOC)
if (!htab_is_prealloc(htab))
delete_all_elements(htab);
else
prealloc_destroy(htab);

View File

@@ -765,38 +765,56 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
static int check_ptr_alignment(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int off, int size)
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size)
{
if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
if (off % size != 0) {
verbose("misaligned access off %d size %d\n",
off, size);
return -EACCES;
} else {
return 0;
}
}
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
/* misaligned access to packet is ok on x86,arm,arm64 */
return 0;
if (reg->id && size != 1) {
verbose("Unknown packet alignment. Only byte-sized access allowed\n");
verbose("Unknown alignment. Only byte-sized access allowed in packet access.\n");
return -EACCES;
}
/* skb->data is NET_IP_ALIGN-ed */
if (reg->type == PTR_TO_PACKET &&
(NET_IP_ALIGN + reg->off + off) % size != 0) {
if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
verbose("misaligned packet access off %d+%d+%d size %d\n",
NET_IP_ALIGN, reg->off, off, size);
return -EACCES;
}
return 0;
}
static int check_val_ptr_alignment(const struct bpf_reg_state *reg,
int size)
{
if (size != 1) {
verbose("Unknown alignment. Only byte-sized access allowed in value access.\n");
return -EACCES;
}
return 0;
}
static int check_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size)
{
switch (reg->type) {
case PTR_TO_PACKET:
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
check_pkt_ptr_alignment(reg, off, size);
case PTR_TO_MAP_VALUE_ADJ:
return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ? 0 :
check_val_ptr_alignment(reg, size);
default:
if (off % size != 0) {
verbose("misaligned access off %d size %d\n",
off, size);
return -EACCES;
}
return 0;
}
}
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -818,7 +836,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
if (size < 0)
return size;
err = check_ptr_alignment(env, reg, off, size);
err = check_ptr_alignment(reg, off, size);
if (err)
return err;
@@ -1925,6 +1943,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* register as unknown.
*/
if (env->allow_ptr_leaks &&
BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1973,14 +1992,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
regs[i].range = dst_reg->off;
/* keep the maximum range already checked */
regs[i].range = max(regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
reg->range = dst_reg->off;
reg->range = max(reg->range, dst_reg->off);
}
}

View File

@@ -1335,26 +1335,21 @@ static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
struct cpuhp_step *sp;
int ret = 0;
mutex_lock(&cpuhp_state_mutex);
if (state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN) {
ret = cpuhp_reserve_state(state);
if (ret < 0)
goto out;
return ret;
state = ret;
}
sp = cpuhp_get_step(state);
if (name && sp->name) {
ret = -EBUSY;
goto out;
}
if (name && sp->name)
return -EBUSY;
sp->startup.single = startup;
sp->teardown.single = teardown;
sp->name = name;
sp->multi_instance = multi_instance;
INIT_HLIST_HEAD(&sp->list);
out:
mutex_unlock(&cpuhp_state_mutex);
return ret;
}
@@ -1428,6 +1423,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
return -EINVAL;
get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !sp->startup.multi)
goto add_node;
@@ -1447,16 +1443,14 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
if (ret) {
if (sp->teardown.multi)
cpuhp_rollback_install(cpu, state, node);
goto err;
goto unlock;
}
}
add_node:
ret = 0;
mutex_lock(&cpuhp_state_mutex);
hlist_add_head(node, &sp->list);
unlock:
mutex_unlock(&cpuhp_state_mutex);
err:
put_online_cpus();
return ret;
}
@@ -1491,6 +1485,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
ret = cpuhp_store_callbacks(state, name, startup, teardown,
multi_instance);
@@ -1524,6 +1519,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
}
}
out:
mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
/*
* If the requested state is CPUHP_AP_ONLINE_DYN, return the
@@ -1547,6 +1543,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
@@ -1563,7 +1561,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
}
remove:
mutex_lock(&cpuhp_state_mutex);
hlist_del(node);
mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
@@ -1571,6 +1568,7 @@ remove:
return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);
/**
* __cpuhp_remove_state - Remove the callbacks for an hotplug machine state
* @state: The state to remove
@@ -1589,6 +1587,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
get_online_cpus();
mutex_lock(&cpuhp_state_mutex);
if (sp->multi_instance) {
WARN(!hlist_empty(&sp->list),
"Error: Removing state %d which has instances left.\n",
@@ -1613,6 +1612,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
remove:
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
}
EXPORT_SYMBOL(__cpuhp_remove_state);

View File

@@ -4256,7 +4256,7 @@ int perf_event_release_kernel(struct perf_event *event)
raw_spin_lock_irq(&ctx->lock);
/*
* Mark this even as STATE_DEAD, there is no external reference to it
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
*
* Anybody acquiring event->child_mutex after the below loop _must_
@@ -10417,21 +10417,22 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
again:
list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
group_entry)
perf_free_event(event, ctx);
raw_spin_lock_irq(&ctx->lock);
/*
* Destroy the task <-> ctx relation and mark the context dead.
*
* This is important because even though the task hasn't been
* exposed yet the context has been (through child_list).
*/
RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
put_task_struct(task); /* cannot be last */
raw_spin_unlock_irq(&ctx->lock);
list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
group_entry)
list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
perf_free_event(event, ctx);
if (!list_empty(&ctx->pinned_groups) ||
!list_empty(&ctx->flexible_groups))
goto again;
mutex_unlock(&ctx->mutex);
put_ctx(ctx);
}
}
@@ -10469,7 +10470,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
}
/*
* inherit a event from parent task to child task:
* Inherit a event from parent task to child task.
*
* Returns:
* - valid pointer on success
* - NULL for orphaned events
* - IS_ERR() on error
*/
static struct perf_event *
inherit_event(struct perf_event *parent_event,
@@ -10563,6 +10569,16 @@ inherit_event(struct perf_event *parent_event,
return child_event;
}
/*
* Inherits an event group.
*
* This will quietly suppress orphaned events; !inherit_event() is not an error.
* This matches with perf_event_release_kernel() removing all child events.
*
* Returns:
* - 0 on success
* - <0 on error
*/
static int inherit_group(struct perf_event *parent_event,
struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10577,6 +10593,11 @@ static int inherit_group(struct perf_event *parent_event,
child, NULL, child_ctx);
if (IS_ERR(leader))
return PTR_ERR(leader);
/*
* @leader can be NULL here because of is_orphaned_event(). In this
* case inherit_event() will create individual events, similar to what
* perf_group_detach() would do anyway.
*/
list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
child_ctr = inherit_event(sub, parent, parent_ctx,
child, leader, child_ctx);
@@ -10586,6 +10607,17 @@ static int inherit_group(struct perf_event *parent_event,
return 0;
}
/*
* Creates the child task context and tries to inherit the event-group.
*
* Clears @inherited_all on !attr.inherited or error. Note that we'll leave
* inherited_all set when we 'fail' to inherit an orphaned event; this is
* consistent with perf_event_release_kernel() removing all child events.
*
* Returns:
* - 0 on success
* - <0 on error
*/
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
@@ -10608,7 +10640,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -10670,7 +10701,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
break;
goto out_unlock;
}
/*
@@ -10686,7 +10717,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
break;
goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10714,6 +10745,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);

View File

@@ -2815,7 +2815,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
@@ -2899,6 +2898,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
rt_mutex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2907,6 +2908,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_unlock(q.lock_ptr);
}
} else {
struct rt_mutex *pi_mutex;
/*
* We have been woken up by futex_unlock_pi(), a timeout, or a
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2930,18 +2933,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
/*
* If fixup_pi_state_owner() faulted and was unable to handle
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
if (ret && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
/*
* If fixup_pi_state_owner() faulted and was unable to handle the
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
* futex_lock_pi() directly. We could restart this syscall, but

View File

@@ -213,10 +213,9 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
*/
if (sem->count == 0)
break;
if (signal_pending_state(state, current)) {
ret = -EINTR;
goto out;
}
if (signal_pending_state(state, current))
goto out_nolock;
set_current_state(state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
@@ -224,12 +223,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
}
/* got the lock */
sem->count = -1;
out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return ret;
out_nolock:
list_del(&waiter.list);
if (!list_empty(&sem->wait_list))
__rwsem_do_wake(sem, 1);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return -EINTR;
}
void __sched __down_write(struct rw_semaphore *sem)

View File

@@ -247,11 +247,9 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
lock_device_hotplug();
mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
mem_hotplug_done();
unlock_device_hotplug();
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
@@ -364,11 +362,9 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
lock_device_hotplug();
mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
mem_hotplug_done();
unlock_device_hotplug();
if (error)
goto err_add_memory;

View File

@@ -186,19 +186,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
spin_unlock(&reorder->lock);
pd->processed++;
spin_unlock(&reorder->lock);
goto out;
}
spin_unlock(&reorder->lock);
if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);

View File

@@ -184,11 +184,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
WARN_ON(!task->ptrace || task->parent != current);
/*
* PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
* Recheck state under the lock to close this race.
*/
spin_lock_irq(&task->sighand->siglock);
if (__fatal_signal_pending(task))
wake_up_state(task, __TASK_TRACED);
else
task->state = TASK_TRACED;
if (task->state == __TASK_TRACED) {
if (__fatal_signal_pending(task))
wake_up_state(task, __TASK_TRACED);
else
task->state = TASK_TRACED;
}
spin_unlock_irq(&task->sighand->siglock);
}

View File

@@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;
/*
* We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
* We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
*/
static __read_mostly u64 raw_offset;
static __read_mostly u64 gtod_offset;
__read_mostly u64 __sched_clock_offset;
static __read_mostly u64 __gtod_offset;
struct sched_clock_data {
u64 tick_raw;
@@ -131,17 +131,24 @@ static void __set_sched_clock_stable(void)
/*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
scd->tick_gtod, gtod_offset,
scd->tick_raw, raw_offset);
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
static_branch_enable(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static void __clear_sched_clock_stable(struct work_struct *work)
static void __sched_clock_work(struct work_struct *work)
{
static_branch_disable(&__sched_clock_stable);
}
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
static void __clear_sched_clock_stable(void)
{
struct sched_clock_data *scd = this_scd();
@@ -154,17 +161,17 @@ static void __clear_sched_clock_stable(struct work_struct *work)
*
* Still do what we can.
*/
gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
__gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
scd->tick_gtod, gtod_offset,
scd->tick_raw, raw_offset);
scd->tick_gtod, __gtod_offset,
scd->tick_raw, __sched_clock_offset);
static_branch_disable(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
if (sched_clock_stable())
schedule_work(&sched_clock_work);
}
void clear_sched_clock_stable(void)
{
@@ -173,7 +180,7 @@ void clear_sched_clock_stable(void)
smp_mb(); /* matches sched_clock_init_late() */
if (sched_clock_running == 2)
schedule_work(&sched_clock_work);
__clear_sched_clock_stable();
}
void sched_clock_init_late(void)
@@ -214,7 +221,7 @@ static inline u64 wrap_max(u64 x, u64 y)
*/
static u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock;
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
again:
@@ -231,9 +238,10 @@ again:
* scd->tick_gtod + TICK_NSEC);
*/
clock = scd->tick_gtod + gtod_offset + delta;
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
gtod = scd->tick_gtod + __gtod_offset;
clock = gtod + delta;
min_clock = wrap_max(gtod, old_clock);
max_clock = wrap_max(old_clock, gtod + TICK_NSEC);
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
@@ -317,7 +325,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;
if (sched_clock_stable())
return sched_clock() + raw_offset;
return sched_clock() + __sched_clock_offset;
if (unlikely(!sched_clock_running))
return 0ull;

View File

@@ -584,20 +584,14 @@ static int sugov_start(struct cpufreq_policy *policy)
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
if (policy_is_shared(policy)) {
sg_cpu->util = 0;
sg_cpu->max = 0;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->last_update = 0;
sg_cpu->iowait_boost = 0;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_single);
}
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
sugov_update_single);
}
return 0;
}

View File

@@ -2133,9 +2133,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
if (*lvalp > UINT_MAX)
return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
*negp = false;
*lvalp = (unsigned long)val;
}
return 0;

View File

@@ -4826,9 +4826,9 @@ static __init int test_ringbuffer(void)
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
"rbtester/%d", cpu);
if (WARN_ON(!rb_threads[cpu])) {
if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
ret = -1;
ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
@@ -4838,9 +4838,9 @@ static __init int test_ringbuffer(void)
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
if (WARN_ON(!rb_hammer)) {
if (WARN_ON(IS_ERR(rb_hammer))) {
pr_cont("FAILED\n");
ret = -1;
ret = PTR_ERR(rb_hammer);
goto out_free;
}